// Re_Backend/src/services/form16Ocr.service.ts

/**
 * Form 16A OCR: Google Gemini (primary) with regex fallback.
 * Supports (1) Gemini API key, (2) Vertex AI with service account, (3) regex fallback.
 */

import { GoogleGenerativeAI } from '@google/generative-ai';
import { VertexAI } from '@google-cloud/vertexai';
import * as fs from 'fs';
import * as path from 'path';
import dotenv from 'dotenv';

import logger from '../utils/logger';

// Load the .env file that sits two directories above this module. The path is
// anchored on __dirname so it does not depend on the process working directory.
const backendDir = path.join(__dirname, '..', '..');
dotenv.config({ path: path.join(backendDir, '.env') });

/**
 * Resolve the service-account key file to use for Vertex AI.
 *
 * Lookup order: GCP_KEY_FILE, then GOOGLE_APPLICATION_CREDENTIALS (relative
 * values are resolved against the backend directory), then the default key
 * under `credentials/` if that file exists on disk.
 *
 * @returns The key path, or null when no env var is set and the default file
 *          is absent. A path taken from an env var is NOT checked for
 *          existence here.
 */
function getForm16VertexKeyPath(): string | null {
  const toAbsolute = (p: string): string => (path.isAbsolute(p) ? p : path.resolve(backendDir, p));

  const configured = process.env.GCP_KEY_FILE?.trim() || process.env.GOOGLE_APPLICATION_CREDENTIALS?.trim();
  if (configured) return toAbsolute(configured);

  const bundledKey = path.join(backendDir, 'credentials', 're-platform-workflow-dealer-3d5738fcc1f9.json');
  if (fs.existsSync(bundledKey)) return bundledKey;
  return null;
}

/**
 * Fields extracted from a Form 16A certificate.
 * Every field is optional and nullable: null means the value could not be
 * found (or was rejected by validation) for this document.
 */
export interface Form16AExtractedData {
  // Full "Name and address of the deductor" block as a single string.
  nameAndAddressOfDeductor?: string | null;
  // Components of the deductor block (name, address, phone, email).
  deductorName?: string | null;
  deductorAddress?: string | null;
  deductorPhone?: string | null;
  deductorEmail?: string | null;
  // Total amount paid/credited (the payment amount, not the TDS amount).
  totalAmountPaid?: number | null;
  // Tax deducted in respect of the deductee.
  totalTaxDeducted?: number | null;
  // Tax deposited/remitted in respect of the deductee.
  totalTdsDeposited?: number | null;
  // TAN of the deductor: 4 letters + 5 digits + 1 letter.
  tanOfDeductor?: string | null;
  // Section code or short description (e.g. 194Q).
  natureOfPayment?: string | null;
  transactionDate?: string | null;
  // OLTAS matching status as shown on the certificate.
  statusOfMatchingOltas?: string | null;
  dateOfBooking?: string | null;
  // Year strings such as 2025-26.
  assessmentYear?: string | null;
  // Normalised to Q1..Q4 by the extractors in this file.
  quarter?: string | null;
  // Certificate number.
  form16aNumber?: string | null;
  financialYear?: string | null;
  certificateDate?: string | null;
  // Aliases: the extractors in this file fill these with the same values as
  // tanOfDeductor, totalTaxDeducted and totalAmountPaid respectively.
  tanNumber?: string | null;
  tdsAmount?: number | null;
  totalAmount?: number | null;
}

/** Outcome of a Form 16A extraction attempt. */
export interface Form16OcrResult {
  // True when an extraction path produced data.
  success: boolean;
  // Extracted fields; set on success.
  data?: Form16AExtractedData;
  // 'gemini' for the Gemini API / Vertex AI paths, 'fallback' for the regex path.
  method?: 'gemini' | 'fallback';
  // Human-readable provider label (e.g. 'Google Gemini API', 'Vertex AI (Gemini)', 'Regex fallback').
  ocrProvider?: string;
  // Low-level error text; set on failure.
  error?: string;
  // Summary message; set on failure.
  message?: string;
}

// Instruction prompt sent together with the document to both the Gemini API
// and Vertex AI. It asks the model to return a single JSON object whose keys
// are the primary fields of Form16AExtractedData. The text is part of runtime
// behaviour: any wording change alters what the model is asked to do.
const GEMINI_PROMPT = `You are an expert at extracting data from Indian Tax Form 16A certificates (TDS certificate under Section 203).

STEP 1 - Read the ENTIRE document: every table, every section, and every line. Form 16A has multiple parts: deductor details, deductee details, and one or more TABLES with payment/TDS figures.

STEP 2 - Extract these fields. For amounts, look in TABLES: find rows or columns with these labels and take the NUMBER in the same row/column (ignore ₹, Rs, commas):

1. nameAndAddressOfDeductor - "Name and address of the deductor". Full block in one string. Also extract: deductorName (person/entity name only), deductorAddress (street, city, state, PIN), deductorPhone, deductorEmail.

2. totalAmountPaid - In "Summary of payment" find "Total(Rs)" or "Amount paid/credited". The LARGE amount (e.g. 181968556.36). Not the TDS amount.

3. totalTaxDeducted - "Amount of Tax Deducted in respect of Deductee" in tax summary table. The TDS amount (e.g. 181969.00). Must be hundreds or more, NOT a single digit like 3.

4. totalTdsDeposited - "Amount of Tax Deposited / Remitted in respect of Deductee". Same as totalTaxDeducted if one total (e.g. 181969.00). NOT a page number.

5. tanOfDeductor - "TAN of the deductor" or "TAN". Must be exactly 10 characters: 4 uppercase letters + 5 digits + 1 letter (e.g. BLRH07660C). No spaces.

6. natureOfPayment - "Nature of payment" or "Section" or "Nature of Payment". Value is usually a section code like 194Q, 194A, 194I, or a short description. Extract that code or text.

7. transactionDate - "Transaction date" or "Date of payment" or "Period" end date. Format DD-MM-YYYY or DD/MM/YYYY.

8. statusOfMatchingOltas - "Status of matching with OLTAS" or "OLTAS". Single letter (F, O, M) or word like "Matched". Extract as shown.

9. dateOfBooking - "Date of booking" or "Date of deposit". DD-MM-YYYY or DD/MM/YYYY.

10. assessmentYear - "Assessment Year" or "AY" from the form header. Format YYYY-YY (e.g. 2025-26). This is the Form 16A assessment year.

11. quarter - "Quarter". Must be Q1, Q2, Q3, or Q4. If you see "Apr-Jun","Jul-Sep","Oct-Dec","Jan-Mar" or "Quarter 1" etc., convert to Q1, Q2, Q3, Q4.

12. form16aNumber - "Certificate Number" or "Certificate No" - the alphanumeric code (e.g. LTZKJZA, 12345). Do NOT return "Last" or "updated" or "on" (from "Last updated on"). Only the certificate ID. If unclear, return null.

13. financialYear - "Financial Year" or "FY". Format YYYY-YY. Can derive from Assessment Year (AY 2025-26 => FY 2024-25).

14. certificateDate - "Date of certificate" or "Last updated on". DD-MM-YYYY. Optional.

RULES:
- Scan every table in the document for amount and TDS figures. The totals are usually in the last row or a row labeled "Total".
- For amounts: output only a number (e.g. 128234.00), no currency symbol or commas.
- For form16aNumber: if the value is a single English word (e.g. contains, certificate, number, nil), return null.
- If a field is truly not in the document, set it to null.
- Return ONLY a single JSON object, no markdown, no \`\`\`, no explanation.

JSON format (use this exact structure):
{
  "nameAndAddressOfDeductor": "string or null",
  "deductorName": "string or null",
  "deductorAddress": "string or null",
  "deductorPhone": "string or null",
  "deductorEmail": "string or null",
  "totalAmountPaid": number or null,
  "totalTaxDeducted": number or null,
  "totalTdsDeposited": number or null,
  "tanOfDeductor": "string or null",
  "natureOfPayment": "string or null",
  "transactionDate": "string or null",
  "statusOfMatchingOltas": "string or null",
  "dateOfBooking": "string or null",
  "assessmentYear": "string or null",
  "quarter": "string or null",
  "form16aNumber": "string or null",
  "financialYear": "string or null",
  "certificateDate": "string or null"
}`;

// ----- Helpers (aligned with REform16) -----
/**
 * Coerce a loosely-typed value into a number.
 * Numbers pass through unchanged; anything else is stringified, stripped of
 * thousands separators and currency markers (₹, Rs, Rs., INR) and parsed.
 *
 * @returns The parsed number, or null for null/undefined/'' and unparseable input.
 */
function getNum(v: unknown): number | null {
  if (v === null || v === undefined || v === '') return null;
  if (typeof v === 'number' && !Number.isNaN(v)) return v;

  const stripped = String(v)
    .replace(/,/g, '')
    .replace(/₹|Rs\.?|INR/gi, '')
    .trim();
  const parsed = parseFloat(stripped);
  return Number.isNaN(parsed) ? null : parsed;
}

/**
 * Stringify and trim a loosely-typed value.
 *
 * @returns The trimmed string, or null for null/undefined and blank values.
 */
function getStr(v: unknown): string | null {
  if (v == null) return null;
  const trimmed = String(v).trim();
  return trimmed === '' ? null : trimmed;
}

/**
 * Split a comma-separated deductor block into name, address, phone and email.
 *
 * The first segment containing an email address becomes `email` and the first
 * segment that looks like a phone number becomes `phone`. Of the remaining
 * segments, the first is the name and the rest (re-joined with ', ') the address.
 *
 * @param block Comma-separated deductor text, or null.
 * @returns All four parts; each is null when not found.
 */
function parseDeductorBlock(block: string | null): { name: string | null; address: string | null; phone: string | null; email: string | null } {
  const parsed = { name: null as string | null, address: null as string | null, phone: null as string | null, email: null as string | null };
  if (!block || typeof block !== 'string') return parsed;

  const segments = block
    .split(/[,]+/)
    .map((segment) => segment.trim())
    .filter(Boolean);

  const emailIdx = segments.findIndex((segment) => /@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/.test(segment));
  const phoneIdx = segments.findIndex(
    (segment) => /^[\+]?[(]?[0-9\s\-()]{10,}$/.test(segment) || /^\+?91[\s\-]?\d{10}$/.test(segment),
  );
  if (emailIdx !== -1) parsed.email = segments[emailIdx];
  if (phoneIdx !== -1) parsed.phone = segments[phoneIdx];

  // Whatever is neither the email nor the phone segment is name + address.
  const [first, ...rest] = segments.filter((_, idx) => idx !== emailIdx && idx !== phoneIdx);
  if (first !== undefined) {
    parsed.name = first;
    if (rest.length > 0) parsed.address = rest.join(', ');
  }
  return parsed;
}

/**
 * Plausibility check for a TDS amount.
 * Rejects null, NaN, negatives and whole numbers below 100; accepts anything
 * >= 100 and any fractional value below 100 (e.g. 50.5).
 */
function isValidTdsAmount(n: number | null): boolean {
  if (n == null || Number.isNaN(n) || n < 0) return false;
  return n >= 100 || !Number.isInteger(n);
}

/**
 * Find the total amount paid: the number following "Total (Rs)", preferring
 * the occurrence after a "Summary of payment" heading and otherwise the first
 * "Total (Rs)" anywhere in the text.
 *
 * @returns A positive amount, or null when nothing usable is found.
 */
function extractTotalAmountPaidForm16A(text: string): number | null {
  const candidates = [
    /Summary\s+of\s+payment[\s\S]*?Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i,
    /Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i,
  ];
  for (const candidate of candidates) {
    const captured = text.match(candidate)?.[1];
    if (!captured) continue;
    const amount = parseFloat(captured.replace(/,/g, ''));
    if (!Number.isNaN(amount) && amount > 0) return amount;
  }
  return null;
}

/**
 * Pull the TDS figures out of Form 16A text.
 *
 * Sources are tried in order and a later source only fills a figure that is
 * still empty:
 *   1. a quarter row where two decimal amounts directly follow the Q1..Q4 token,
 *   2. the "Amount of Tax Deducted" / "Amount of Tax Deposited" labels,
 *   3. the "Total (Rs)" after the OLTAS status section (reported as `totalRs`),
 *   4. the "sum of Rs. ..." / "deposited ... Rs. ..." sentences.
 * Every figure must pass isValidTdsAmount.
 */
function extractTDSAmountsForm16A(text: string): { taxDeducted: number | null; taxDeposited: number | null; totalRs: number | null } {
  const toAmount = (raw: string): number => parseFloat(raw.replace(/,/g, ''));

  let taxDeducted: number | null = null;
  let taxDeposited: number | null = null;
  let totalRs: number | null = null;

  // 1. Quarter row.
  const quarterRow = text.match(/Q[1-4][A-Z0-9]*([0-9,]+\.\d{2})([0-9,]+\.\d{2})/);
  if (quarterRow?.[1] && quarterRow?.[2]) {
    const rowDeducted = toAmount(quarterRow[1]);
    const rowDeposited = toAmount(quarterRow[2]);
    if (isValidTdsAmount(rowDeducted)) taxDeducted = rowDeducted;
    if (isValidTdsAmount(rowDeposited)) taxDeposited = rowDeposited;
  }

  // 2. Labelled amounts.
  const deductedLabel = text.match(/Amount\s+of\s+Tax\s+Deducted[\s\S]*?([0-9,]{3,}\.?[0-9]*|[0-9,]+\.\d{2})\s*(?:Amount|Deductee|$)/i);
  if (deductedLabel?.[1] && !taxDeducted) {
    const value = toAmount(deductedLabel[1]);
    if (isValidTdsAmount(value)) taxDeducted = value;
  }
  const depositedLabel = text.match(/Amount\s+of\s+Tax\s+Deposited[\s\S]*?([0-9,]{3,}\.?[0-9]*|[0-9,]+\.\d{2})/i);
  if (depositedLabel?.[1] && !taxDeposited) {
    const value = toAmount(depositedLabel[1]);
    if (isValidTdsAmount(value)) taxDeposited = value;
  }

  // 3. Total after the OLTAS status section.
  const oltasTotal = text.match(/Status\s+of\s+matching[\s\S]*?Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i)
    ?? text.match(/OLTAS[\s\S]*?Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i);
  if (oltasTotal?.[1]) {
    const value = toAmount(oltasTotal[1]);
    if (isValidTdsAmount(value)) totalRs = value;
  }

  // 4. Amounts quoted inside the verification sentences.
  const deductedSentence = text.match(/sum\s+of\s+Rs\.?\s*([0-9,]+\.?[0-9]*)\s*\[?Rs\.?\s*[Oo]ne\s+[Ll]akh/i);
  if (deductedSentence?.[1] && !taxDeducted) {
    const value = toAmount(deductedSentence[1]);
    if (isValidTdsAmount(value)) taxDeducted = value;
  }
  const depositedSentence = text.match(/deposited\s+[a-z\s]+Rs\.?\s*([0-9,]+\.?[0-9]*)\s*\[?Rs\.?\s*[Oo]ne/i);
  if (depositedSentence?.[1] && !taxDeposited) {
    const value = toAmount(depositedSentence[1]);
    if (isValidTdsAmount(value)) taxDeposited = value;
  }

  // Final guard: never hand back a figure that fails the plausibility check.
  if (taxDeducted != null && !isValidTdsAmount(taxDeducted)) taxDeducted = null;
  if (taxDeposited != null && !isValidTdsAmount(taxDeposited)) taxDeposited = null;
  if (totalRs != null && !isValidTdsAmount(totalRs)) totalRs = null;

  return { taxDeducted, taxDeposited, totalRs };
}

/**
 * Form 16A-specific quarter lookup: a "Quarter" label followed by Q1..Q4, or a
 * Q1..Q4 token that leads into a number. Falls back to the generic
 * extractQuarter heuristics when neither is present.
 */
function extractQuarterForm16A(text: string): string | null {
  const labelled = text.match(/\bQuarter\s*[:\s]*\n?\s*(Q[1-4])/i);
  const hit = labelled ?? text.match(/\b(Q[1-4])[A-Z0-9]*\s*[0-9]/);
  const token = hit?.[1];
  return token ? token.toUpperCase() : extractQuarter(text);
}

/**
 * Generic quarter detection.
 *
 * Tries numeric forms first ("Quarter 2", "Q2", "2 Quarter"), then infers the
 * quarter from the first month name in the text, using the April-March
 * financial year (Apr-Jun = Q1 ... Jan-Mar = Q4).
 *
 * Month names must not be embedded in a longer word; otherwise "Mar" inside
 * "Summary" (as in "Summary of payment") would yield a bogus Q4. Adjacent
 * digits are still allowed so compact dates such as "01Apr2024" keep working.
 *
 * @returns 'Q1'..'Q4', or null when nothing matches.
 */
function extractQuarter(text: string): string | null {
  const patterns = [
    /Quarter[:\s]*([1-4])/i,
    /Q[:\s]*([1-4])/i,
    /([1-4])\s*Quarter/i,
  ];
  for (const pattern of patterns) {
    const m = text.match(pattern);
    if (m?.[1]) return `Q${m[1]}`;
  }

  const quarterByMonth: Record<string, string> = {
    apr: 'Q1', may: 'Q1', jun: 'Q1',
    jul: 'Q2', aug: 'Q2', sep: 'Q2',
    oct: 'Q3', nov: 'Q3', dec: 'Q3',
    jan: 'Q4', feb: 'Q4', mar: 'Q4',
  };
  // Letter-only lookarounds: reject "Summary"/"remark", accept "01-Apr-2024" and "01Apr2024".
  const dateMatch = text.match(
    /(?<![a-z])(April|Apr|May|June|Jun|July|Jul|August|Aug|September|Sept|Sep|October|Oct|November|Nov|December|Dec|January|Jan|February|Feb|March|Mar)(?![a-z])/i,
  );
  if (dateMatch) {
    const monthKey = dateMatch[1].toLowerCase().slice(0, 3);
    return quarterByMonth[monthKey] ?? null;
  }
  return null;
}

/**
 * Extract the "nature of payment" section code (e.g. 194Q, 194IA, 206C).
 *
 * Order of attempts:
 *   1. a table row where the code is glued between an amount and a date
 *      (e.g. "5000.00194C01-04-2024");
 *   2. the first STANDALONE code after the "Nature of payment" label;
 *   3. any standalone code carrying a letter suffix, anywhere in the text;
 *   4. legacy last resort: the first code-like run after the label, even when
 *      it is glued to neighbouring text.
 *
 * Steps 2 and 3 use word boundaries so fragments of years and amounts
 * ("202" from 2024, "196" from 1961) are not mistaken for a section, and they
 * accept multi-letter suffixes so 194IA / 194LBA are returned whole.
 *
 * @returns The code as it appears in the text, or null.
 */
function extractNatureOfPayment(text: string): string | null {
  const tableRow = text.match(/\.(\d{2})\s*(19[4-9][A-Z]?|20[0-6][A-Z]?)\s*\d{2}-/);
  if (tableRow?.[2]) return tableRow[2];

  const afterLabel = text.match(/Nature\s+of\s+payment[\s\S]*?\b(19[4-9][A-Z]{0,3}|20[0-6][A-Z]{0,2})\b/i);
  if (afterLabel?.[1]) return afterLabel[1];

  const standalone = text.match(/\b(19[4-9][A-Z]{1,3}|20[0-6][A-Z]{1,2})\b/);
  if (standalone?.[1]) return standalone[1];

  // Kept so any input the previous implementation resolved still resolves.
  const legacy = text.match(/Nature\s+of\s+payment[\s\S]*?(19[4-9][A-Z]?|20[0-6][A-Z]?)/i);
  return legacy?.[1] ?? null;
}

/**
 * Transaction date: the end date of a "Period From <d-Mon-yyyy> To <d-Mon-yyyy>"
 * range when present, otherwise the first numeric d-m-yyyy date in the text.
 */
function extractTransactionDate(text: string): string | null {
  const period = /Period\s*From\s*(\d{1,2}-[A-Za-z]{3}-\d{4})\s*To\s*(\d{1,2}-[A-Za-z]{3}-\d{4})/i.exec(text);
  if (period?.[2]) return period[2];

  const firstNumericDate = /(\d{1,2}-\d{1,2}-\d{4})/.exec(text);
  return firstNumericDate ? firstNumericDate[1] : null;
}

/**
 * OLTAS matching status letter (one of F, O, M, U, P), upper-cased.
 *
 * Tried in order: the letter after "<date> <digits>" following the
 * "Status of matching with OLTAS" label; a letter directly before
 * "Final"/"Unmatched"; a letter after any "<date> <digits>" pair.
 */
function extractOltasStatus(text: string): string | null {
  const attempts = [
    /Status\s+of\s+matching\s+with\s+OLTAS[\s\S]*?(\d{2}-\d{2}-\d{4})\s*(\d+)\s*([FOMUP])/i,
    /([FOMUP])\s*Final\s*|([FOMUP])\s*Unmatched/i,
    /\d{2}-\d{2}-\d{4}\s*\d+\s*([FOMUP])/,
  ];
  let hit: RegExpMatchArray | null = null;
  for (const attempt of attempts) {
    hit = text.match(attempt);
    if (hit) break;
  }
  if (!hit) return null;
  // Group 3 belongs to the labelled pattern; groups 1/2 to the other two.
  const letter = hit[3] || hit[1] || hit[2] || '';
  return letter.toUpperCase();
}

/**
 * Date of booking: the first numeric d-m-yyyy date after
 * "Date on which tax deposited", or failing that after the word "Challan".
 */
function extractDateOfBooking(text: string): string | null {
  const afterDepositLabel = /Date\s+on\s+which\s+tax\s+deposited[\s\S]*?(\d{1,2}-\d{1,2}-\d{4})/i.exec(text);
  if (afterDepositLabel) return afterDepositLabel[1] ?? null;

  const afterChallan = /Challan[\s\S]*?(\d{1,2}-\d{1,2}-\d{4})/i.exec(text);
  return afterChallan?.[1] ?? null;
}

/**
 * Certificate number from "Certificate No", "Form 16A No", "Form 16A" or
 * "Certificate number" labels.
 *
 * A captured token is skipped when it is one of the known filler words (for
 * example "Last", picked up from "Last updated on") and the next label is tried.
 */
function extractForm16ANumber(text: string): string | null {
  const fillerWords = new Set(['contains', 'certificate', 'number', 'nil', 'na', 'n/a', 'none', 'last']);
  const labelPatterns = [
    /Certificate\s*No\.?[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
    /Form\s*16A\s*No\.?[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
    /Form\s*16A[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
    /Certificate\s*number[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
  ];
  for (const labelPattern of labelPatterns) {
    const captured = text.match(labelPattern)?.[1];
    if (!captured) continue;
    const candidate = captured.trim();
    if (candidate.length < 3 || fillerWords.has(candidate.toLowerCase())) continue;
    if (/\d/.test(candidate) || /^[A-Z0-9\-]{3,30}$/i.test(candidate)) return candidate;
  }
  return null;
}

/**
 * Extract the deductor's TAN (4 letters + 5 digits + 1 letter), upper-cased.
 *
 * Labelled forms ("TAN", "Tax Deduction Account Number") are tried first,
 * then the first TAN-shaped token anywhere in the text.
 *
 * The unlabelled pattern must NOT carry the `g` flag: with `g`,
 * String.prototype.match returns the list of all matches, so index 1 would be
 * the second match rather than the capture group, and a document with a
 * single unlabelled TAN would produce null.
 */
function extractTAN(text: string): string | null {
  const patterns = [
    /TAN[:\s]*([A-Z]{4}[0-9]{5}[A-Z]{1})/i,
    /Tax\s*Deduction\s*Account\s*Number[:\s]*([A-Z]{4}[0-9]{5}[A-Z]{1})/i,
    /([A-Z]{4}[0-9]{5}[A-Z]{1})/,
  ];
  for (const pattern of patterns) {
    const match = text.match(pattern);
    if (match?.[1]) return match[1].trim().toUpperCase();
  }
  return null;
}

/**
 * Deductor name: the run of letters, whitespace and `& . ,` that follows a
 * "Deductor", "Name of Deductor" or "Company Name" label (first label that
 * matches wins). The result is trimmed.
 */
function extractDeductorName(text: string): string | null {
  const labelPatterns = [
    /Deductor[:\s]*([A-Z][A-Za-z\s&.,]+)/i,
    /Name\s*of\s*Deductor[:\s]*([A-Z][A-Za-z\s&.,]+)/i,
    /Company\s*Name[:\s]*([A-Z][A-Za-z\s&.,]+)/i,
  ];
  for (const labelPattern of labelPatterns) {
    const captured = labelPattern.exec(text)?.[1];
    if (captured) return captured.trim();
  }
  return null;
}

/**
 * Extract the financial year (e.g. "2024-25").
 *
 * Order of attempts:
 *   1. an explicit "Financial Year" / "FY" label;
 *   2. an explicit "Assessment Year" / "AY" label, from which the financial
 *      year is derived (AY 2025-26 => FY 2024-25);
 *   3. the first yyyy-yy / yyyy-yyyy shaped token anywhere in the text.
 *
 * Step 2 exists because a certificate may print only the assessment year;
 * without it, step 3 would return the assessment year itself as the
 * financial year.
 */
function extractFinancialYear(text: string): string | null {
  const labelled = [
    /Financial\s*Year[:\s]*([0-9]{4}[-/][0-9]{2,4})/i,
    /FY[:\s]*([0-9]{4}[-/][0-9]{2,4})/i,
  ];
  for (const pattern of labelled) {
    const match = text.match(pattern);
    if (match?.[1]) return match[1].trim();
  }

  const assessment = text.match(/(?:Assessment\s*Year|\bAY)[:\s]*([0-9]{4})[-/]([0-9]{2,4})/i);
  if (assessment) {
    const startYear = parseInt(assessment[1], 10) - 1;
    const endYear = String(startYear + 1);
    // Mirror the width the document used for the second half (yy vs yyyy).
    const endPart = assessment[2].length === 4 ? endYear : endYear.slice(-2);
    return `${startYear}-${endPart}`;
  }

  const generic = text.match(/([0-9]{4}[-/][0-9]{2,4})/);
  return generic?.[1] ? generic[1].trim() : null;
}

/**
 * Extract the assessment year (e.g. "2025-26").
 *
 * An explicit "Assessment Year" / "AY" label is authoritative and is checked
 * first. Only when no label is present is the value derived from the
 * financial year (FY 2024-25 => AY 2025-26).
 *
 * Checking the label first matters: the financial-year extractor can fall
 * back to the first yyyy-yy token in the text, which may be the assessment
 * year itself, and deriving from that would shift the result one year too far.
 */
function extractAssessmentYear(text: string): string | null {
  const labelled = [
    /Assessment\s*Year[:\s]*([0-9]{4}[-/][0-9]{2,4})/i,
    /\bAY[:\s]*([0-9]{4}[-/][0-9]{2,4})/i,
  ];
  for (const pattern of labelled) {
    const match = text.match(pattern);
    if (match?.[1]) return match[1].trim();
  }

  const financialYear = extractFinancialYear(text);
  if (financialYear) {
    const parts = financialYear.split(/[-/]/);
    if (parts.length === 2) {
      const startYear = parseInt(parts[0], 10);
      return `${startYear + 1}-${(startYear + 2).toString().slice(-2)}`;
    }
  }
  return null;
}

/**
 * Certificate date: a numeric d-m-yyyy or d/m/yyyy date following
 * "Certificate Date", then any "Date", then "Issued on" (first hit wins).
 */
function extractCertificateDate(text: string): string | null {
  const labelPatterns = [
    /Certificate\s*Date[:\s]*([0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{4})/i,
    /Date[:\s]*([0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{4})/i,
    /Issued\s*on[:\s]*([0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{4})/i,
  ];
  for (const labelPattern of labelPatterns) {
    const captured = labelPattern.exec(text)?.[1];
    if (captured) return captured.trim();
  }
  return null;
}

/**
 * Parse raw Form 16A text (REform16-aligned) into structured fields.
 *
 * Lines are trimmed and blank lines dropped before the individual extractors
 * run. The deductor block is the text between the "Name and address of the
 * deductor" label and the next deductee / "PAN of the deductor" label, with
 * its lines joined by ', '.
 */
function parseForm16ARawText(text: string): Form16AExtractedData {
  const fullText = text
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter(Boolean)
    .join('\n');

  // Text between the deductor label and the following section label.
  const readDeductorBlock = (): string | null => {
    const from = fullText.search(/Name\s+and\s+address\s+of\s+the\s+deductor/i);
    const to = fullText.search(/Name\s+and\s+address\s+of\s+the\s+deductee|PAN\s+of\s+the\s+deductor/i);
    if (from === -1 || to === -1 || to <= from) return null;
    const body = fullText
      .slice(from, to)
      .replace(/Name\s+and\s+address\s+of\s+the\s+deductor\s*/i, '')
      .trim();
    const joined = body
      .split(/\n/)
      .map((line) => line.trim())
      .filter(Boolean)
      .join(', ');
    return joined || null;
  };
  const nameAndAddressOfDeductor = readDeductorBlock() || extractDeductorName(fullText);

  const tanOfDeductor = extractTAN(fullText);
  const totalAmountPaid = extractTotalAmountPaidForm16A(fullText);

  // Each TDS figure falls back to the OLTAS total; deposited finally falls back to deducted.
  const tds = extractTDSAmountsForm16A(fullText);
  const totalTaxDeducted = tds.taxDeducted ?? tds.totalRs ?? null;
  const totalTdsDeposited = tds.taxDeposited ?? tds.totalRs ?? totalTaxDeducted ?? null;

  const form16aNumber = extractForm16ANumber(fullText);
  const assessmentYear = extractAssessmentYear(fullText);
  const quarter = extractQuarterForm16A(fullText);
  const natureOfPayment = extractNatureOfPayment(fullText);
  const transactionDate = extractTransactionDate(fullText);
  const statusOfMatchingOltas = extractOltasStatus(fullText);
  const certificateDate = extractCertificateDate(fullText);
  const dateOfBooking = extractDateOfBooking(fullText);

  // No financial year found: step back one year from the assessment year.
  let financialYear = extractFinancialYear(fullText);
  if (!financialYear && assessmentYear) {
    const ayParts = assessmentYear.split(/[-/]/).map((part) => parseInt(part, 10));
    if (ayParts.length === 2 && !Number.isNaN(ayParts[1])) {
      financialYear = `${ayParts[0] - 1}-${String(ayParts[1] - 1).padStart(2, '0')}`;
    }
  }

  const deductor = parseDeductorBlock(nameAndAddressOfDeductor || '');
  return {
    nameAndAddressOfDeductor,
    deductorName: deductor.name || nameAndAddressOfDeductor,
    deductorAddress: deductor.address,
    deductorPhone: deductor.phone,
    deductorEmail: deductor.email,
    totalAmountPaid,
    totalTaxDeducted,
    totalTdsDeposited,
    tanOfDeductor,
    natureOfPayment,
    transactionDate,
    statusOfMatchingOltas,
    dateOfBooking,
    assessmentYear,
    quarter,
    form16aNumber,
    financialYear,
    certificateDate,
    // Alias fields carry the same values as their primary counterparts.
    tanNumber: tanOfDeductor,
    tdsAmount: totalTaxDeducted,
    totalAmount: totalAmountPaid,
  };
}

/**
 * Fallback: pdf-parse (v2 PDFParse API) + parseForm16ARawText (REform16-aligned).
 *
 * Reads the PDF, extracts its text layer and runs the regex extractors on it.
 * The TAN is dropped when it does not have the 4-letter/5-digit/1-letter
 * shape, and the quarter is normalised to Q1..Q4 (or null).
 *
 * Never throws: any failure (unreadable file, parser error) is logged and
 * returned as `{ success: false, error, message }`.
 *
 * @param filePath Path of the PDF to read.
 */
async function fallbackExtraction(filePath: string): Promise<Form16OcrResult> {
  try {
    const dataBuffer = fs.readFileSync(filePath);
    const { PDFParse } = await import('pdf-parse');
    const parser = new PDFParse({ data: new Uint8Array(dataBuffer) });
    let text: unknown = '';
    try {
      const textResult = await parser.getText();
      text = textResult?.text ?? '';
    } finally {
      // Release the parser even when getText() throws, so it is not leaked.
      await parser.destroy();
    }
    if (!text || typeof text !== 'string') {
      logger.warn('[Form16 OCR] Fallback: no text extracted from PDF');
      return { success: false, message: 'No text could be extracted from PDF', error: 'Empty PDF text' };
    }
    const extracted = parseForm16ARawText(text);
    if (extracted.tanOfDeductor && !/^[A-Z]{4}[0-9]{5}[A-Z]{1}$/.test(extracted.tanOfDeductor)) {
      extracted.tanOfDeductor = null;
      extracted.tanNumber = null;
    }
    if (extracted.quarter) {
      const q = extracted.quarter.toUpperCase().trim();
      const qMatch = q.match(/[Q]?([1-4])/);
      extracted.quarter = /^Q[1-4]$/.test(q) ? q : (qMatch ? `Q${qMatch[1]}` : null);
    }
    logger.info('[Form16 OCR] Fallback extraction completed');
    return {
      success: true,
      data: extracted,
      method: 'fallback',
      ocrProvider: 'Regex fallback',
    };
  } catch (error: unknown) {
    const errMsg = error instanceof Error ? error.message : String(error);
    logger.error('[Form16 OCR] Fallback extraction error:', error);
    return {
      success: false,
      error: errMsg,
      message: 'Failed to extract data from PDF',
    };
  }
}

/**
 * Normalise the JSON object returned by the model into Form16AExtractedData.
 *
 * - Strings are trimmed and blank values become null; amounts are parsed to numbers.
 * - The certificate number is dropped when it is longer than 50 characters,
 *   is/starts with/ends with a filler word, or is neither digit-bearing nor a
 *   3-30 character alphanumeric token.
 * - TDS amounts that are negative, or whole numbers below 100, are dropped.
 * - The TAN must be 4 letters + 5 digits + 1 letter (after upper-casing).
 * - The quarter is reduced to Q1..Q4, otherwise null.
 * - Alias keys (tdsAmount, totalAmount, tanNumber) are accepted as input
 *   fallbacks and populated on output.
 */
function sanitizeAndCleanGeminiData(extracted: Record<string, unknown>): Form16AExtractedData {
  const fillerWords = ['contains', 'certificate', 'number', 'nil', 'na', 'n/a', 'none', 'not', 'mentioned', 'see', 'above', 'below', 'refer', 'document', 'form', 'the', 'a', 'an', 'last'];
  const isPlausibleCertNumber = (s: string): boolean => {
    if (s.length > 50) return false;
    const lower = s.toLowerCase();
    const hasFiller = fillerWords.some((w) => lower === w || lower.startsWith(w + ' ') || lower.endsWith(' ' + w));
    if (hasFiller) return false;
    return /\d/.test(s) || /^[A-Z0-9\-]{3,30}$/i.test(s);
  };
  const certCandidate = getStr(extracted.form16aNumber);
  const form16aNumber = certCandidate && isPlausibleCertNumber(certCandidate) ? certCandidate : null;

  // Same plausibility rule as the regex path: tiny whole numbers are not TDS amounts.
  const dropImplausibleTds = (n: number | null): number | null => {
    if (n == null || n < 0) return null;
    return n >= 100 || !Number.isInteger(n) ? n : null;
  };
  const taxDeducted = dropImplausibleTds(getNum(extracted.totalTaxDeducted ?? extracted.tdsAmount));
  const taxDeposited = dropImplausibleTds(getNum(extracted.totalTdsDeposited ?? extracted.tdsAmount));

  const deductorBlock = getStr(extracted.nameAndAddressOfDeductor ?? extracted.deductorName);
  const deductor = parseDeductorBlock(deductorBlock);

  const tanRaw = getStr(extracted.tanOfDeductor ?? extracted.tanNumber);
  const tanCandidate = tanRaw ? tanRaw.toUpperCase().trim() : null;
  const tan = tanCandidate && /^[A-Z]{4}[0-9]{5}[A-Z]{1}$/.test(tanCandidate) ? tanCandidate : null;

  let quarter: string | null = null;
  const quarterRaw = getStr(extracted.quarter);
  if (quarterRaw) {
    const upper = quarterRaw.toUpperCase().trim();
    if (/^Q[1-4]$/.test(upper)) {
      quarter = upper;
    } else {
      const digit = upper.match(/[Q]?([1-4])/);
      if (digit) quarter = `Q${digit[1]}`;
    }
  }

  const amountPaid = getNum(extracted.totalAmountPaid ?? extracted.totalAmount);

  return {
    nameAndAddressOfDeductor: deductorBlock,
    deductorName: getStr(extracted.deductorName ?? deductor.name) || deductorBlock,
    deductorAddress: getStr(extracted.deductorAddress ?? deductor.address),
    deductorPhone: getStr(extracted.deductorPhone ?? deductor.phone),
    deductorEmail: getStr(extracted.deductorEmail ?? deductor.email),
    totalAmountPaid: amountPaid,
    totalTaxDeducted: taxDeducted,
    totalTdsDeposited: taxDeposited,
    tanOfDeductor: tan,
    natureOfPayment: getStr(extracted.natureOfPayment),
    transactionDate: getStr(extracted.transactionDate),
    statusOfMatchingOltas: getStr(extracted.statusOfMatchingOltas),
    dateOfBooking: getStr(extracted.dateOfBooking),
    assessmentYear: getStr(extracted.assessmentYear),
    quarter,
    form16aNumber,
    financialYear: getStr(extracted.financialYear),
    certificateDate: getStr(extracted.certificateDate),
    tanNumber: tan,
    tdsAmount: taxDeducted,
    totalAmount: amountPaid,
  };
}

/**
 * Run Form 16A extraction via Vertex AI (service account).
 *
 * Project, location and model come from GCP_PROJECT_ID,
 * FORM16_VERTEX_LOCATION / VERTEX_AI_LOCATION and GEMINI_MODEL, each with a
 * built-in default. The regex fallback is used (and its result returned)
 * when no key file is available, when the model returns no text, or when the
 * returned text cannot be parsed as JSON. Errors thrown by the SDK call
 * itself are not caught here.
 *
 * @param filePath   Path of the document (used for logging and the fallback).
 * @param fileBase64 Base64-encoded file content sent inline to the model.
 * @param mimeType   MIME type declared for the inline content.
 */
async function extractWithVertexAI(filePath: string, fileBase64: string, mimeType: string): Promise<Form16OcrResult> {
  const env = process.env;
  const projectId = env.GCP_PROJECT_ID?.trim() || 're-platform-workflow-dealer';
  const location = env.FORM16_VERTEX_LOCATION?.trim() || env.VERTEX_AI_LOCATION?.trim() || 'us-central1';
  const modelId = env.GEMINI_MODEL?.trim() || 'gemini-2.0-flash-lite';

  const keyPath = getForm16VertexKeyPath();
  if (!keyPath || !fs.existsSync(keyPath)) {
    logger.warn('[Form16 OCR] Vertex: no service account key file found. Set GCP_KEY_FILE or GOOGLE_APPLICATION_CREDENTIALS.');
    return await fallbackExtraction(filePath);
  }

  const generativeModel = new VertexAI({
    project: projectId,
    location,
    googleAuthOptions: { keyFilename: keyPath },
  }).getGenerativeModel({
    model: modelId,
    generationConfig: { temperature: 0.1, topP: 0.95, topK: 40, maxOutputTokens: 8192 },
  });
  logger.info(`[Form16 OCR] Using Vertex AI (${modelId}, ${location}) for ${path.basename(filePath)}`);

  // One user turn: the instruction prompt followed by the document itself.
  const response = await generativeModel.generateContent({
    contents: [
      {
        role: 'user',
        parts: [
          { text: GEMINI_PROMPT },
          { inlineData: { mimeType, data: fileBase64 } },
        ],
      },
    ],
  });

  // Only the first part of the first candidate is read.
  const firstPart = response.response?.candidates?.[0]?.content?.parts?.[0];
  const text = firstPart && 'text' in firstPart ? (firstPart as { text: string }).text : '';
  if (!text || !text.trim()) {
    logger.warn('[Form16 OCR] Vertex AI returned no text, using fallback');
    return await fallbackExtraction(filePath);
  }

  let extractedData: Record<string, unknown>;
  try {
    // Strip markdown fences, then parse the outermost {...} span if there is one.
    const withoutFences = text.trim().replace(/```json\s*/g, '').replace(/```\s*/g, '').trim();
    const jsonSource = withoutFences.match(/\{[\s\S]*\}/)?.[0] ?? withoutFences;
    extractedData = JSON.parse(jsonSource) as Record<string, unknown>;
  } catch (parseErr) {
    logger.warn('[Form16 OCR] Failed to parse Vertex AI JSON, using fallback:', parseErr);
    return await fallbackExtraction(filePath);
  }

  const data = sanitizeAndCleanGeminiData(extractedData);
  logger.info('[Form16 OCR] Vertex AI extraction completed successfully');
  return {
    success: true,
    data,
    method: 'gemini',
    ocrProvider: 'Vertex AI (Gemini)',
  };
}

/**
 * Extract Form 16A details from PDF: (1) Gemini API key, (2) Vertex AI with service account, (3) regex fallback.
 *
 * With GEMINI_API_KEY set, the Gemini API is called first. If it yields no
 * usable text, Vertex AI is tried and then the regex fallback; if its text is
 * not valid JSON, the regex fallback is used directly. Without an API key,
 * Vertex AI is tried and then the regex fallback. Any error thrown by the
 * Gemini/Vertex calls also ends in the regex fallback.
 *
 * The MIME type sent to the model is derived from the file extension
 * (.pdf, .png, .jpg/.jpeg, .webp, .heic, .heif); unknown extensions are sent
 * as image/png.
 *
 * @param filePath Path of the uploaded document.
 * @returns The extraction result; failures of the extraction paths are
 *          reported through `success: false` rather than thrown.
 * @throws If the file cannot be read at all (the initial read happens before
 *         any fallback handling).
 */
export async function extractForm16ADetails(filePath: string): Promise<Form16OcrResult> {
  const fileBuffer = fs.readFileSync(filePath);
  const fileBase64 = fileBuffer.toString('base64');

  // Declare the real content type: a JPEG labelled image/png is mis-declared input.
  const mimeTypesByExt: Record<string, string> = {
    '.pdf': 'application/pdf',
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.webp': 'image/webp',
    '.heic': 'image/heic',
    '.heif': 'image/heif',
  };
  const ext = path.extname(filePath).toLowerCase();
  const mimeType = mimeTypesByExt[ext] ?? 'image/png';

  // Second and third choice, shared by every "Gemini API gave nothing usable" branch.
  const vertexThenFallback = async (): Promise<Form16OcrResult> => {
    const vertexResult = await extractWithVertexAI(filePath, fileBase64, mimeType);
    if (vertexResult.success) return vertexResult;
    return await fallbackExtraction(filePath);
  };

  try {
    const geminiKey = process.env.GEMINI_API_KEY?.trim();
    if (geminiKey) {
      const genAI = new GoogleGenerativeAI(geminiKey);
      // Trimmed like the Vertex path, so stray whitespace in .env does not produce an invalid model id.
      const modelId = process.env.GEMINI_MODEL?.trim() || 'gemini-2.0-flash';
      const model = genAI.getGenerativeModel({
        model: modelId,
        generationConfig: { temperature: 0.1, topP: 0.95, topK: 40 },
      });
      logger.info(`[Form16 OCR] Using Gemini API (${modelId}) for ${path.basename(filePath)}`);
      const imagePart = { inlineData: { data: fileBase64, mimeType } };
      const result = await model.generateContent([GEMINI_PROMPT, imagePart]);
      const response = result.response;
      if (!response) {
        logger.warn('[Form16 OCR] Gemini API returned no response, trying Vertex AI or fallback');
        return await vertexThenFallback();
      }
      let text: string;
      try {
        text = response.text();
      } catch (textErr) {
        logger.warn('[Form16 OCR] Gemini API response.text() failed, trying Vertex AI or fallback:', textErr);
        return await vertexThenFallback();
      }
      if (!text || !text.trim()) {
        return await vertexThenFallback();
      }
      let extractedData: Record<string, unknown>;
      try {
        // Strip markdown fences, then parse the outermost {...} span if there is one.
        const cleaned = text.trim().replace(/```json\s*/g, '').replace(/```\s*/g, '').trim();
        const jsonMatch = cleaned.match(/\{[\s\S]*\}/);
        extractedData = JSON.parse(jsonMatch ? jsonMatch[0] : cleaned) as Record<string, unknown>;
      } catch (parseErr) {
        logger.warn('[Form16 OCR] Failed to parse Gemini API JSON, using fallback:', parseErr);
        return await fallbackExtraction(filePath);
      }
      const data = sanitizeAndCleanGeminiData(extractedData);
      return {
        success: true,
        data,
        method: 'gemini',
        ocrProvider: 'Google Gemini API',
      };
    }

    // No API key: use Vertex AI with service account
    const vertexResult = await extractWithVertexAI(filePath, fileBase64, mimeType);
    if (vertexResult.success) return vertexResult;
    logger.warn('[Form16 OCR] Vertex AI failed or unavailable, using regex fallback');
    return await fallbackExtraction(filePath);
  } catch (error: unknown) {
    logger.error('[Form16 OCR] Gemini/Vertex extraction error:', error);
    logger.info('[Form16 OCR] Falling back to regex-based extraction');
    return await fallbackExtraction(filePath);
  }
}