/**
 * Form 16A OCR: Google Gemini (primary) with regex fallback.
 * Supports (1) Gemini API key, (2) Vertex AI with service account, (3) regex fallback.
 */
import { GoogleGenerativeAI } from '@google/generative-ai';
import { VertexAI } from '@google-cloud/vertexai';
import * as fs from 'fs';
import * as path from 'path';
import dotenv from 'dotenv';
import logger from '../utils/logger';

// Ensure .env is loaded (backend may run from different cwd)
const backendDir = path.join(__dirname, '../..');
dotenv.config({ path: path.join(backendDir, '.env') });

/**
 * Resolve the service-account key file used for Vertex AI.
 *
 * Precedence: `GCP_KEY_FILE`, then `GOOGLE_APPLICATION_CREDENTIALS`, then a default file under
 * `<backendDir>/credentials`. Relative env values are resolved against `backendDir`.
 *
 * @returns Path to the key file, or null when no env var is set and the default file is absent.
 *          A path taken from an env var is NOT checked for existence here.
 */
function getForm16VertexKeyPath(): string | null {
  const keyFile = process.env.GCP_KEY_FILE?.trim();
  const creds = process.env.GOOGLE_APPLICATION_CREDENTIALS?.trim();
  const configured = keyFile || creds;
  if (configured) {
    return path.isAbsolute(configured) ? configured : path.resolve(backendDir, configured);
  }
  const defaultPath = path.join(backendDir, 'credentials', 're-platform-workflow-dealer-3d5738fcc1f9.json');
  return fs.existsSync(defaultPath) ? defaultPath : null;
}

/** Fields extracted from a Form 16A certificate. Every field is optional and may be null. */
export interface Form16AExtractedData {
  nameAndAddressOfDeductor?: string | null;
  deductorName?: string | null;
  deductorAddress?: string | null;
  deductorPhone?: string | null;
  deductorEmail?: string | null;
  totalAmountPaid?: number | null;
  totalTaxDeducted?: number | null;
  totalTdsDeposited?: number | null;
  tanOfDeductor?: string | null;
  natureOfPayment?: string | null;
  transactionDate?: string | null;
  statusOfMatchingOltas?: string | null;
  dateOfBooking?: string | null;
  assessmentYear?: string | null;
  quarter?: string | null;
  form16aNumber?: string | null;
  financialYear?: string | null;
  certificateDate?: string | null;
  /** Alias of `tanOfDeductor`. */
  tanNumber?: string | null;
  /** Alias of `totalTaxDeducted`. */
  tdsAmount?: number | null;
  /** Alias of `totalAmountPaid`. */
  totalAmount?: number | null;
}

/** Outcome of an extraction attempt; `method` tells which pipeline produced `data`. */
export interface Form16OcrResult {
  success: boolean;
  data?: Form16AExtractedData;
  method?: 'gemini' | 'fallback';
  ocrProvider?: string;
  error?: string;
  message?: string;
}

/**
 * Instruction prompt sent to the model together with the document.
 * The segments are joined with single spaces, so the model receives one space-separated paragraph.
 */
const GEMINI_PROMPT = [
  'You are an expert at extracting data from Indian Tax Form 16A certificates (TDS certificate under Section 203).',
  'STEP 1 - Read the ENTIRE document: every table, every section, and every line. Form 16A has multiple parts: deductor details, deductee details, and one or more TABLES with payment/TDS figures.',
  'STEP 2 - Extract these fields. For amounts, look in TABLES: find rows or columns with these labels and take the NUMBER in the same row/column (ignore ₹, Rs, commas):',
  '1. nameAndAddressOfDeductor - "Name and address of the deductor". Full block in one string. Also extract: deductorName (person/entity name only), deductorAddress (street, city, state, PIN), deductorPhone, deductorEmail.',
  '2. totalAmountPaid - In "Summary of payment" find "Total(Rs)" or "Amount paid/credited". The LARGE amount (e.g. 181968556.36). Not the TDS amount.',
  '3. totalTaxDeducted - "Amount of Tax Deducted in respect of Deductee" in tax summary table. The TDS amount (e.g. 181969.00). Must be hundreds or more, NOT a single digit like 3.',
  '4. totalTdsDeposited - "Amount of Tax Deposited / Remitted in respect of Deductee". Same as totalTaxDeducted if one total (e.g. 181969.00). NOT a page number.',
  '5. tanOfDeductor - "TAN of the deductor" or "TAN". Must be exactly 10 characters: 4 uppercase letters + 5 digits + 1 letter (e.g. BLRH07660C). No spaces.',
  '6. natureOfPayment - "Nature of payment" or "Section" or "Nature of Payment". Value is usually a section code like 194Q, 194A, 194I, or a short description. Extract that code or text.',
  '7. transactionDate - "Transaction date" or "Date of payment" or "Period" end date. Format DD-MM-YYYY or DD/MM/YYYY.',
  '8. statusOfMatchingOltas - "Status of matching with OLTAS" or "OLTAS". Single letter (F, O, M) or word like "Matched". Extract as shown.',
  '9. dateOfBooking - "Date of booking" or "Date of deposit". DD-MM-YYYY or DD/MM/YYYY.',
  '10. assessmentYear - "Assessment Year" or "AY" from the form header. Format YYYY-YY (e.g. 2025-26). This is the Form 16A assessment year.',
  '11. quarter - "Quarter". Must be Q1, Q2, Q3, or Q4. If you see "Apr-Jun","Jul-Sep","Oct-Dec","Jan-Mar" or "Quarter 1" etc., convert to Q1, Q2, Q3, Q4.',
  '12. form16aNumber - "Certificate Number" or "Certificate No" - the alphanumeric code (e.g. LTZKJZA, 12345). Do NOT return "Last" or "updated" or "on" (from "Last updated on"). Only the certificate ID. If unclear, return null.',
  '13. financialYear - "Financial Year" or "FY". Format YYYY-YY. Can derive from Assessment Year (AY 2025-26 => FY 2024-25).',
  '14. certificateDate - "Date of certificate" or "Last updated on". DD-MM-YYYY. Optional.',
  'RULES:',
  '- Scan every table in the document for amount and TDS figures. The totals are usually in the last row or a row labeled "Total".',
  '- For amounts: output only a number (e.g. 128234.00), no currency symbol or commas.',
  '- For form16aNumber: if the value is a single English word (e.g. contains, certificate, number, nil), return null.',
  '- If a field is truly not in the document, set it to null.',
  '- Return ONLY a single JSON object, no markdown, no ```, no explanation.',
  'JSON format (use this exact structure):',
  '{',
  '"nameAndAddressOfDeductor": "string or null",',
  '"deductorName": "string or null",',
  '"deductorAddress": "string or null",',
  '"deductorPhone": "string or null",',
  '"deductorEmail": "string or null",',
  '"totalAmountPaid": number or null,',
  '"totalTaxDeducted": number or null,',
  '"totalTdsDeposited": number or null,',
  '"tanOfDeductor": "string or null",',
  '"natureOfPayment": "string or null",',
  '"transactionDate": "string or null",',
  '"statusOfMatchingOltas": "string or null",',
  '"dateOfBooking": "string or null",',
  '"assessmentYear": "string or null",',
  '"quarter": "string or null",',
  '"form16aNumber": "string or null",',
  '"financialYear": "string or null",',
  '"certificateDate": "string or null"',
  '}',
].join(' ');

// ----- Helpers (aligned with REform16) -----

/** TAN shape: 4 uppercase letters + 5 digits + 1 uppercase letter. */
const TAN_PATTERN = /^[A-Z]{4}[0-9]{5}[A-Z]$/;

/** A year range such as `2024-25`, `2024/25` or `2024-2025`. */
const YEAR_RANGE = '([0-9]{4}[-/][0-9]{2,4})';
const FINANCIAL_YEAR_LABELS = [
  new RegExp(`Financial\\s*Year[:\\s]*${YEAR_RANGE}`, 'i'),
  new RegExp(`\\bFY[:\\s]*${YEAR_RANGE}`, 'i'),
];
const ASSESSMENT_YEAR_LABELS = [
  new RegExp(`Assessment\\s*Year[:\\s]*${YEAR_RANGE}`, 'i'),
  new RegExp(`\\bAY[:\\s]*${YEAR_RANGE}`, 'i'),
];

/** Quarter of the Indian financial year (April to March), keyed by 3-letter month prefix. */
const MONTH_TO_QUARTER: Record<string, string> = {
  apr: 'Q1', may: 'Q1', jun: 'Q1',
  jul: 'Q2', aug: 'Q2', sep: 'Q2',
  oct: 'Q3', nov: 'Q3', dec: 'Q3',
  jan: 'Q4', feb: 'Q4', mar: 'Q4',
};

/**
 * Coerce a loosely-typed value to a finite number.
 * Strings may carry thousands separators and a currency marker (₹, Rs, Rs., INR).
 *
 * @returns The parsed number, or null for null/undefined/empty/unparsable/non-finite input.
 */
function getNum(v: unknown): number | null {
  if (v == null || v === '') return null;
  if (typeof v === 'number') return Number.isFinite(v) ? v : null;
  const s = String(v).replace(/,/g, '').replace(/₹|Rs\.?|INR/gi, '').trim();
  const n = parseFloat(s);
  return Number.isFinite(n) ? n : null;
}

/**
 * Coerce a loosely-typed value to a trimmed, non-empty string.
 * The literal strings "null"/"undefined" are treated as missing: the prompt shows
 * `"string or null"` in quotes, so the model can emit the word instead of a JSON null.
 */
function getStr(v: unknown): string | null {
  if (v == null) return null;
  const s = String(v).trim();
  if (s === '' || /^(null|undefined)$/i.test(s)) return null;
  return s;
}

/** Parse a comma-grouped amount captured by a regex; null when absent or not numeric. */
function parseAmount(raw: string | null | undefined): number | null {
  if (!raw) return null;
  const n = parseFloat(raw.replace(/,/g, ''));
  return Number.isNaN(n) ? null : n;
}

/** Return the trimmed first capture group of the first pattern that matches, else null. */
function firstCapture(text: string, patterns: readonly RegExp[]): string | null {
  for (const pattern of patterns) {
    const captured = text.match(pattern)?.[1];
    if (captured) return captured.trim();
  }
  return null;
}

/**
 * Shift a `YYYY-YY` style range by `delta` years, e.g. (`2025-26`, -1) => `2024-25`.
 * Returns null unless the input has exactly two parts and a numeric start year.
 */
function shiftYearRange(range: string, delta: number): string | null {
  const parts = range.split(/[-/]/);
  if (parts.length !== 2) return null;
  const start = parseInt(parts[0] ?? '', 10);
  if (Number.isNaN(start)) return null;
  const from = start + delta;
  return `${from}-${String((from + 1) % 100).padStart(2, '0')}`;
}

/**
 * Normalise a quarter label to `Q1`..`Q4`.
 * Accepts an exact `Qn`, otherwise the first standalone digit 1-4. The digit must not be
 * adjacent to another digit, so a year such as "2024" is not misread as Q2.
 */
function normalizeQuarter(raw: string | null | undefined): string | null {
  if (!raw) return null;
  const q = raw.toUpperCase().trim();
  if (/^Q[1-4]$/.test(q)) return q;
  const m = q.match(/(?:^|[^0-9])([1-4])(?![0-9])/);
  return m?.[1] ? `Q${m[1]}` : null;
}

/**
 * Split a comma-separated deductor block into name, address, phone and email.
 * The part containing an e-mail address and the part that looks like a phone number are
 * pulled out; of what remains, the first part is the name and the rest form the address.
 */
function parseDeductorBlock(block: string | null): { name: string | null; address: string | null; phone: string | null; email: string | null } {
  const result = { name: null as string | null, address: null as string | null, phone: null as string | null, email: null as string | null };
  if (!block || typeof block !== 'string') return result;

  const parts = block.split(/[,]+/).map((p) => p.trim()).filter(Boolean);
  const emailPart = parts.find((p) => /@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/.test(p));
  const phonePart = parts.find((p) => /^[\+]?[(]?[0-9\s\-()]{10,}$/.test(p) || /^\+?91[\s\-]?\d{10}$/.test(p));

  if (emailPart) {
    result.email = emailPart;
    parts.splice(parts.indexOf(emailPart), 1);
  }
  if (phonePart) {
    result.phone = phonePart;
    parts.splice(parts.indexOf(phonePart), 1);
  }
  if (parts.length > 0) {
    result.name = parts[0] ?? null;
    if (parts.length > 1) result.address = parts.slice(1).join(', ');
  }
  return result;
}

/**
 * Sanity check for a TDS amount: rejects null, NaN, negatives and whole numbers below 100
 * (the prompt warns about single digits and page numbers being picked up as amounts).
 * Fractional values below 100 are accepted.
 */
function isValidTdsAmount(n: number | null): boolean {
  if (n == null || Number.isNaN(n) || n < 0) return false;
  return n >= 100 || !Number.isInteger(n);
}

/**
 * Find the total amount paid: the number after "Total (Rs)", preferring the occurrence
 * that follows "Summary of payment". Only positive values are accepted.
 */
function extractTotalAmountPaidForm16A(text: string): number | null {
  const patterns = [
    /Summary\s+of\s+payment[\s\S]*?Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i,
    /Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i,
  ];
  for (const pattern of patterns) {
    const n = parseAmount(text.match(pattern)?.[1]);
    if (n != null && n > 0) return n;
  }
  return null;
}

/**
 * Extract the tax-deducted / tax-deposited amounts and the "Total (Rs)" of the OLTAS table.
 *
 * For each of deducted and deposited the sources are tried in order and the first value
 * passing {@link isValidTdsAmount} wins:
 *   1. a quarter row with two adjacent `x.xx` amounts,
 *   2. the labeled "Amount of Tax Deducted/Deposited" cell,
 *   3. the certification sentence ("sum of Rs. ..." / "deposited ... Rs. ...").
 */
function extractTDSAmountsForm16A(text: string): { taxDeducted: number | null; taxDeposited: number | null; totalRs: number | null } {
  const validTds = (raw: string | undefined): number | null => {
    const amount = parseAmount(raw);
    return isValidTdsAmount(amount) ? amount : null;
  };
  const firstTds = (pattern: RegExp): number | null => validTds(text.match(pattern)?.[1]);

  const quarterLine = text.match(/Q[1-4][A-Z0-9]*([0-9,]+\.\d{2})([0-9,]+\.\d{2})/);
  let taxDeducted = validTds(quarterLine?.[1]);
  let taxDeposited = validTds(quarterLine?.[2]);

  taxDeducted = taxDeducted ?? firstTds(/Amount\s+of\s+Tax\s+Deducted[\s\S]*?([0-9,]{3,}\.?[0-9]*|[0-9,]+\.\d{2})\s*(?:Amount|Deductee|$)/i);
  taxDeposited = taxDeposited ?? firstTds(/Amount\s+of\s+Tax\s+Deposited[\s\S]*?([0-9,]{3,}\.?[0-9]*|[0-9,]+\.\d{2})/i);

  // The "Status of matching" anchor is preferred; the OLTAS anchor is tried only when it is absent.
  const totalRsMatch =
    text.match(/Status\s+of\s+matching[\s\S]*?Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i) ||
    text.match(/OLTAS[\s\S]*?Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i);
  const totalRs = validTds(totalRsMatch?.[1]);

  // NOTE(review): these two patterns only match when the amount in words starts with "One"
  // ("One Lakh ..."). They are kept as-is; confirm whether other amounts should be supported.
  taxDeducted = taxDeducted ?? firstTds(/sum\s+of\s+Rs\.?\s*([0-9,]+\.?[0-9]*)\s*\[?Rs\.?\s*[Oo]ne\s+[Ll]akh/i);
  taxDeposited = taxDeposited ?? firstTds(/deposited\s+[a-z\s]+Rs\.?\s*([0-9,]+\.?[0-9]*)\s*\[?Rs\.?\s*[Oo]ne/i);

  return { taxDeducted, taxDeposited, totalRs };
}

/** Extract the quarter, preferring Form 16A specific layouts before the generic heuristics. */
function extractQuarterForm16A(text: string): string | null {
  const m =
    text.match(/\bQuarter\s*[:\s]*\n?\s*(Q[1-4])/i) ||
    text.match(/\b(Q[1-4])[A-Z0-9]*\s*[0-9]/);
  if (m?.[1]) return m[1].toUpperCase();
  return extractQuarter(text);
}

/**
 * Generic quarter heuristics: "Quarter 1", "Q 1", "1 Quarter", else the first month name found.
 * Month names must be whole words so that e.g. "mar" inside "Summary" is not read as March.
 */
function extractQuarter(text: string): string | null {
  const patterns = [
    /Quarter[:\s]*([1-4])/i,
    /Q[:\s]*([1-4])/i,
    /([1-4])\s*Quarter/i,
  ];
  for (const pattern of patterns) {
    const m = text.match(pattern);
    if (m?.[1]) return `Q${m[1]}`;
  }
  const monthMatch = text.match(/\b(Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|September|Oct|October|Nov|November|Dec|December|Jan|January|Feb|February|Mar|March)\b/i);
  if (monthMatch?.[1]) {
    return MONTH_TO_QUARTER[monthMatch[1].slice(0, 3).toLowerCase()] ?? null;
  }
  return null;
}

/** Extract the section code (e.g. 194Q) from a table row, the labeled field, or anywhere in the text. */
function extractNatureOfPayment(text: string): string | null {
  const m = text.match(/\.(\d{2})\s*(19[4-9][A-Z]?|20[0-6][A-Z]?)\s*\d{2}-/);
  if (m?.[2]) return m[2];
  const m2 = text.match(/Nature\s+of\s+payment[\s\S]*?(19[4-9][A-Z]?|20[0-6][A-Z]?)/i);
  if (m2?.[1]) return m2[1];
  const m3 = text.match(/\b(19[4-9][A-Z]|20[0-6][A-Z])\b/);
  return m3?.[1] ?? null;
}

/** Return the end date of "Period From ... To ...", else the first `D-M-YYYY` date in the text. */
function extractTransactionDate(text: string): string | null {
  const period = text.match(/Period\s*From\s*(\d{1,2}-[A-Za-z]{3}-\d{4})\s*To\s*(\d{1,2}-[A-Za-z]{3}-\d{4})/i);
  if (period?.[2]) return period[2];
  return text.match(/\d{1,2}-\d{1,2}-\d{4}/)?.[0] ?? null;
}

/** Extract the OLTAS matching status letter (one of F, O, M, U, P), upper-cased. */
function extractOltasStatus(text: string): string | null {
  const m =
    text.match(/Status\s+of\s+matching\s+with\s+OLTAS[\s\S]*?(\d{2}-\d{2}-\d{4})\s*(\d+)\s*([FOMUP])/i) ||
    text.match(/([FOMUP])\s*Final\s*|([FOMUP])\s*Unmatched/i) ||
    text.match(/\d{2}-\d{2}-\d{4}\s*\d+\s*([FOMUP])/);
  if (!m) return null;
  // Group 3 belongs to the first pattern (its group 1 is a date); groups 1/2 to the other two.
  const status = m[3] || m[1] || m[2] || '';
  return status ? status.toUpperCase() : null;
}

/** Extract the first date after "Date on which tax deposited", else the first date after "Challan". */
function extractDateOfBooking(text: string): string | null {
  const m =
    text.match(/Date\s+on\s+which\s+tax\s+deposited[\s\S]*?(\d{1,2}-\d{1,2}-\d{4})/i) ||
    text.match(/Challan[\s\S]*?(\d{1,2}-\d{1,2}-\d{4})/i);
  return m?.[1] ?? null;
}

/** Extract the certificate number, skipping captures that are known filler words. */
function extractForm16ANumber(text: string): string | null {
  const invalidWords = ['contains', 'certificate', 'number', 'nil', 'na', 'n/a', 'none', 'last'];
  const patterns = [
    /Certificate\s*No\.?[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
    /Form\s*16A\s*No\.?[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
    /Form\s*16A[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
    /Certificate\s*number[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
  ];
  for (const pattern of patterns) {
    const match = text.match(pattern);
    if (match?.[1]) {
      const val = match[1].trim();
      if (invalidWords.includes(val.toLowerCase()) || val.length < 3) continue;
      if (/\d/.test(val)) return val;
      if (/^[A-Z0-9\-]{3,30}$/i.test(val)) return val;
    }
  }
  return null;
}

/**
 * Extract the deductor TAN: labeled occurrences first, then the first TAN-shaped token.
 * The generic pattern is deliberately not global: with the `g` flag `match()` returns the
 * list of full matches, so index 1 would be the second TAN in the text rather than the capture.
 */
function extractTAN(text: string): string | null {
  const tan = firstCapture(text, [
    /TAN[:\s]*([A-Z]{4}[0-9]{5}[A-Z]{1})/i,
    /Tax\s*Deduction\s*Account\s*Number[:\s]*([A-Z]{4}[0-9]{5}[A-Z]{1})/i,
    /\b([A-Z]{4}[0-9]{5}[A-Z]{1})\b/,
  ]);
  return tan ? tan.toUpperCase() : null;
}

/** Extract the deductor name following a "Deductor" / "Name of Deductor" / "Company Name" label. */
function extractDeductorName(text: string): string | null {
  return firstCapture(text, [
    /Deductor[:\s]*([A-Z][A-Za-z\s&.,]+)/i,
    /Name\s*of\s*Deductor[:\s]*([A-Z][A-Za-z\s&.,]+)/i,
    /Company\s*Name[:\s]*([A-Z][A-Za-z\s&.,]+)/i,
  ]);
}

/**
 * Extract the financial year.
 *
 * Order: labeled "Financial Year"/"FY"; else derived from a labeled "Assessment Year"/"AY"
 * (AY 2025-26 => FY 2024-25); else the first bare year range in the text. The labeled
 * assessment year is checked before the bare range so that an "Assessment Year 2025-26"
 * header is not mistaken for the financial year.
 */
function extractFinancialYear(text: string): string | null {
  const labeled = firstCapture(text, FINANCIAL_YEAR_LABELS);
  if (labeled) return labeled;

  const assessmentYear = firstCapture(text, ASSESSMENT_YEAR_LABELS);
  const derived = assessmentYear ? shiftYearRange(assessmentYear, -1) : null;
  if (derived) return derived;

  return firstCapture(text, [new RegExp(YEAR_RANGE)]);
}

/**
 * Extract the assessment year: the labeled "Assessment Year"/"AY" when present,
 * otherwise the financial year shifted forward by one (FY 2024-25 => AY 2025-26).
 */
function extractAssessmentYear(text: string): string | null {
  const labeled = firstCapture(text, ASSESSMENT_YEAR_LABELS);
  if (labeled) return labeled;

  const financialYear = extractFinancialYear(text);
  return financialYear ? shiftYearRange(financialYear, 1) : null;
}

/** Extract the certificate date after "Certificate Date", "Date" or "Issued on". */
function extractCertificateDate(text: string): string | null {
  return firstCapture(text, [
    /Certificate\s*Date[:\s]*([0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{4})/i,
    /Date[:\s]*([0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{4})/i,
    /Issued\s*on[:\s]*([0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{4})/i,
  ]);
}

/** Parse Form 16A raw text (REform16-aligned). */
function parseForm16ARawText(text: string): Form16AExtractedData {
  // Normalise: trim every line and drop blank ones before running the extractors.
  const lines = text.split(/\r?\n/).map((l) => l.trim()).filter(Boolean);
  const fullText = lines.join('\n');

  // Deductor block = text between the deductor label and the next section label.
  let nameAndAddressOfDeductor: string | null = null;
  const deductorStart = fullText.search(/Name\s+and\s+address\s+of\s+the\s+deductor/i);
  const deductorEnd = fullText.search(/Name\s+and\s+address\s+of\s+the\s+deductee|PAN\s+of\s+the\s+deductor/i);
  if (deductorStart !== -1 && deductorEnd !== -1 && deductorEnd > deductorStart) {
    const block = fullText.slice(deductorStart, deductorEnd);
    const afterLabel = block.replace(/Name\s+and\s+address\s+of\s+the\s+deductor\s*/i, '').trim();
    nameAndAddressOfDeductor = afterLabel.split(/\n/).map((l) => l.trim()).filter(Boolean).join(', ') || null;
  }
  if (!nameAndAddressOfDeductor) nameAndAddressOfDeductor = extractDeductorName(fullText);

  const tanOfDeductor = extractTAN(fullText);
  const totalAmountPaid = extractTotalAmountPaidForm16A(fullText);
  const tdsAmounts = extractTDSAmountsForm16A(fullText);
  const totalTaxDeducted = tdsAmounts.taxDeducted ?? tdsAmounts.totalRs ?? null;
  const totalTdsDeposited = tdsAmounts.taxDeposited ?? tdsAmounts.totalRs ?? totalTaxDeducted ?? null;
  const form16aNumber = extractForm16ANumber(fullText);
  const assessmentYear = extractAssessmentYear(fullText);
  // extractFinancialYear already derives the value from a labeled assessment year when needed.
  const financialYear = extractFinancialYear(fullText);
  const quarter = extractQuarterForm16A(fullText);
  const natureOfPayment = extractNatureOfPayment(fullText);
  const transactionDate = extractTransactionDate(fullText);
  const statusOfMatchingOltas = extractOltasStatus(fullText);
  const certificateDate = extractCertificateDate(fullText);
  const dateOfBooking = extractDateOfBooking(fullText);
  const parsedDeductor = parseDeductorBlock(nameAndAddressOfDeductor || '');

  return {
    nameAndAddressOfDeductor,
    deductorName: parsedDeductor.name || nameAndAddressOfDeductor,
    deductorAddress: parsedDeductor.address,
    deductorPhone: parsedDeductor.phone,
    deductorEmail: parsedDeductor.email,
    totalAmountPaid,
    totalTaxDeducted,
    totalTdsDeposited,
    tanOfDeductor,
    natureOfPayment,
    transactionDate,
    statusOfMatchingOltas,
    dateOfBooking,
    assessmentYear,
    quarter,
    form16aNumber,
    financialYear,
    certificateDate,
    tanNumber: tanOfDeductor,
    tdsAmount: totalTaxDeducted,
    totalAmount: totalAmountPaid,
  };
}

/**
 * Fallback: pdf-parse (v2 PDFParse API) + parseForm16ARawText (REform16-aligned).
 *
 * Never throws: read/parse failures are logged and reported as `{ success: false }`.
 */
async function fallbackExtraction(filePath: string): Promise<Form16OcrResult> {
  try {
    const dataBuffer = fs.readFileSync(filePath);
    const { PDFParse } = await import('pdf-parse');
    const parser = new PDFParse({ data: new Uint8Array(dataBuffer) });

    let text: unknown = '';
    try {
      const textResult = await parser.getText();
      text = textResult?.text ?? '';
    } finally {
      // Release the parser even when getText() throws.
      await parser.destroy();
    }

    if (typeof text !== 'string' || !text.trim()) {
      logger.warn('[Form16 OCR] Fallback: no text extracted from PDF');
      return { success: false, message: 'No text could be extracted from PDF', error: 'Empty PDF text' };
    }

    const extracted = parseForm16ARawText(text);
    if (extracted.tanOfDeductor && !TAN_PATTERN.test(extracted.tanOfDeductor)) {
      extracted.tanOfDeductor = null;
      extracted.tanNumber = null;
    }
    if (extracted.quarter) extracted.quarter = normalizeQuarter(extracted.quarter);

    logger.info('[Form16 OCR] Fallback extraction completed');
    return {
      success: true,
      data: extracted,
      method: 'fallback',
      ocrProvider: 'Regex fallback',
    };
  } catch (error: unknown) {
    const errMsg = error instanceof Error ? error.message : String(error);
    logger.error('[Form16 OCR] Fallback extraction error:', error);
    return {
      success: false,
      error: errMsg,
      message: 'Failed to extract data from PDF',
    };
  }
}

/**
 * Parse the model's reply into a JSON object.
 * Strips Markdown code fences and, when present, parses only the outermost `{ ... }` span.
 *
 * @throws SyntaxError when the text is not valid JSON; Error when the JSON is not an object.
 */
function parseModelJson(text: string): Record<string, unknown> {
  const cleaned = text.trim().replace(/```json\s*/g, '').replace(/```\s*/g, '').trim();
  const jsonMatch = cleaned.match(/\{[\s\S]*\}/);
  const parsed: unknown = JSON.parse(jsonMatch ? jsonMatch[0] : cleaned);
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    throw new Error('Model response is not a JSON object');
  }
  return parsed as Record<string, unknown>;
}

/**
 * Validate and normalise the JSON object returned by the model.
 *
 * - Certificate numbers that are filler words or otherwise implausible become null.
 * - TDS amounts failing {@link isValidTdsAmount} become null.
 * - TAN is upper-cased, stripped of whitespace and must match the TAN shape.
 * - Quarter is normalised to `Q1`..`Q4`.
 * - Missing deductor fields are filled from the parsed name-and-address block.
 * - Legacy keys (`tdsAmount`, `tanNumber`, `totalAmount`) are read as fallbacks and
 *   written back as aliases.
 */
function sanitizeAndCleanGeminiData(extracted: Record<string, unknown>): Form16AExtractedData {
  const invalidCertWords = ['contains', 'certificate', 'number', 'nil', 'na', 'n/a', 'none', 'not', 'mentioned', 'see', 'above', 'below', 'refer', 'document', 'form', 'the', 'a', 'an', 'last'];
  const isInvalidCertNumber = (s: string | null): boolean => {
    if (!s || s.length > 50) return true;
    const lower = s.toLowerCase();
    if (invalidCertWords.some((w) => lower === w || lower.startsWith(w + ' ') || lower.endsWith(' ' + w))) return true;
    if (/\d/.test(s)) return false;
    if (/^[A-Z0-9\-]{3,30}$/i.test(s)) return false;
    return true;
  };
  const sanitizeTds = (n: number | null): number | null => (isValidTdsAmount(n) ? n : null);

  const rawCertNo = getStr(extracted.form16aNumber);
  const form16aNumber = rawCertNo && !isInvalidCertNumber(rawCertNo) ? rawCertNo : null;

  // getNum/getStr are applied per candidate so that an empty or unparsable primary value
  // still falls through to the alias / parsed value.
  const safeTdsDeducted = sanitizeTds(getNum(extracted.totalTaxDeducted) ?? getNum(extracted.tdsAmount));
  const safeTdsDeposited = sanitizeTds(getNum(extracted.totalTdsDeposited) ?? getNum(extracted.tdsAmount));
  const totalAmountPaid = getNum(extracted.totalAmountPaid) ?? getNum(extracted.totalAmount);

  const deductorBlock = getStr(extracted.nameAndAddressOfDeductor) ?? getStr(extracted.deductorName);
  const parsedDeductor = parseDeductorBlock(deductorBlock);

  const tanStr = getStr(extracted.tanOfDeductor) ?? getStr(extracted.tanNumber);
  const tanCandidate = tanStr ? tanStr.toUpperCase().replace(/\s+/g, '') : null;
  const tanUpper = tanCandidate && TAN_PATTERN.test(tanCandidate) ? tanCandidate : null;

  const quarter = normalizeQuarter(getStr(extracted.quarter));

  return {
    nameAndAddressOfDeductor: deductorBlock,
    deductorName: getStr(extracted.deductorName) ?? parsedDeductor.name ?? deductorBlock,
    deductorAddress: getStr(extracted.deductorAddress) ?? parsedDeductor.address,
    deductorPhone: getStr(extracted.deductorPhone) ?? parsedDeductor.phone,
    deductorEmail: getStr(extracted.deductorEmail) ?? parsedDeductor.email,
    totalAmountPaid,
    totalTaxDeducted: safeTdsDeducted,
    totalTdsDeposited: safeTdsDeposited,
    tanOfDeductor: tanUpper,
    natureOfPayment: getStr(extracted.natureOfPayment),
    transactionDate: getStr(extracted.transactionDate),
    statusOfMatchingOltas: getStr(extracted.statusOfMatchingOltas),
    dateOfBooking: getStr(extracted.dateOfBooking),
    assessmentYear: getStr(extracted.assessmentYear),
    quarter,
    form16aNumber,
    financialYear: getStr(extracted.financialYear),
    certificateDate: getStr(extracted.certificateDate),
    tanNumber: tanUpper,
    tdsAmount: safeTdsDeducted,
    totalAmount: totalAmountPaid,
  };
}

/**
 * Run Form 16A extraction via Vertex AI (service account).
 *
 * Falls back to {@link fallbackExtraction} when no key file is found, when the model returns
 * no text, or when its reply cannot be parsed as a JSON object. Errors raised by the Vertex
 * client itself are NOT caught here and propagate to the caller.
 *
 * @param filePath   Path of the document, used for logging and for the regex fallback.
 * @param fileBase64 Base64-encoded file content sent inline to the model.
 * @param mimeType   MIME type sent alongside the inline data.
 */
async function extractWithVertexAI(filePath: string, fileBase64: string, mimeType: string): Promise<Form16OcrResult> {
  const projectId = process.env.GCP_PROJECT_ID?.trim() || 're-platform-workflow-dealer';
  const location = process.env.FORM16_VERTEX_LOCATION?.trim() || process.env.VERTEX_AI_LOCATION?.trim() || 'us-central1';
  const modelId = process.env.GEMINI_MODEL?.trim() || 'gemini-2.0-flash-lite';

  const keyPath = getForm16VertexKeyPath();
  if (!keyPath || !fs.existsSync(keyPath)) {
    logger.warn('[Form16 OCR] Vertex: no service account key file found. Set GCP_KEY_FILE or GOOGLE_APPLICATION_CREDENTIALS.');
    return await fallbackExtraction(filePath);
  }

  const vertexAI = new VertexAI({
    project: projectId,
    location,
    googleAuthOptions: { keyFilename: keyPath },
  });
  const generativeModel = vertexAI.getGenerativeModel({
    model: modelId,
    generationConfig: { temperature: 0.1, topP: 0.95, topK: 40, maxOutputTokens: 8192 },
  });
  logger.info(`[Form16 OCR] Using Vertex AI (${modelId}, ${location}) for ${path.basename(filePath)}`);

  const request = {
    contents: [
      {
        role: 'user',
        parts: [
          { text: GEMINI_PROMPT },
          { inlineData: { mimeType, data: fileBase64 } },
        ],
      },
    ],
  };
  const response = await generativeModel.generateContent(request);

  // Concatenate every text part of the first candidate instead of reading only parts[0].
  const parts = response.response?.candidates?.[0]?.content?.parts ?? [];
  const text = parts
    .map((part) => ('text' in part && typeof part.text === 'string' ? part.text : ''))
    .join('');
  if (!text.trim()) {
    logger.warn('[Form16 OCR] Vertex AI returned no text, using fallback');
    return await fallbackExtraction(filePath);
  }

  let extractedData: Record<string, unknown>;
  try {
    extractedData = parseModelJson(text);
  } catch (parseErr) {
    logger.warn('[Form16 OCR] Failed to parse Vertex AI JSON, using fallback:', parseErr);
    return await fallbackExtraction(filePath);
  }

  const data = sanitizeAndCleanGeminiData(extractedData);
  logger.info('[Form16 OCR] Vertex AI extraction completed successfully');
  return {
    success: true,
    data,
    method: 'gemini',
    ocrProvider: 'Vertex AI (Gemini)',
  };
}

/** MIME type sent to the model for a file, chosen by extension; unknown extensions use image/png. */
function mimeTypeForFile(filePath: string): string {
  const byExtension: Record<string, string> = {
    '.pdf': 'application/pdf',
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.webp': 'image/webp',
    '.heic': 'image/heic',
    '.heif': 'image/heif',
  };
  return byExtension[path.extname(filePath).toLowerCase()] ?? 'image/png';
}

/** Try Vertex AI and return its result when successful, otherwise run the regex fallback. */
async function tryVertexThenFallback(filePath: string, fileBase64: string, mimeType: string): Promise<Form16OcrResult> {
  const vertexResult = await extractWithVertexAI(filePath, fileBase64, mimeType);
  if (vertexResult.success) return vertexResult;
  return await fallbackExtraction(filePath);
}

/**
 * Extract Form 16A details from PDF: (1) Gemini API key, (2) Vertex AI with service account, (3) regex fallback.
 *
 * With `GEMINI_API_KEY` set, the Gemini API is called first. A missing/unreadable/empty reply
 * goes to Vertex AI and then the regex fallback; an unparsable JSON reply goes straight to
 * the regex fallback. Without a key, Vertex AI is tried and then the regex fallback.
 *
 * Any error, including failure to read the file, is logged and routed to the regex fallback,
 * so the returned promise resolves with `success: false` rather than rejecting.
 *
 * @param filePath Path to the Form 16A document (PDF or image).
 */
export async function extractForm16ADetails(filePath: string): Promise<Form16OcrResult> {
  try {
    const fileBase64 = fs.readFileSync(filePath).toString('base64');
    const mimeType = mimeTypeForFile(filePath);

    const geminiKey = process.env.GEMINI_API_KEY?.trim();
    if (!geminiKey) {
      // No API key: use Vertex AI with service account
      const vertexResult = await extractWithVertexAI(filePath, fileBase64, mimeType);
      if (vertexResult.success) return vertexResult;
      logger.warn('[Form16 OCR] Vertex AI failed or unavailable, using regex fallback');
      return await fallbackExtraction(filePath);
    }

    const genAI = new GoogleGenerativeAI(geminiKey);
    const modelId = process.env.GEMINI_MODEL || 'gemini-2.0-flash';
    const model = genAI.getGenerativeModel({
      model: modelId,
      generationConfig: { temperature: 0.1, topP: 0.95, topK: 40 },
    });
    logger.info(`[Form16 OCR] Using Gemini API (${modelId}) for ${path.basename(filePath)}`);

    const imagePart = { inlineData: { data: fileBase64, mimeType } };
    const result = await model.generateContent([GEMINI_PROMPT, imagePart]);
    const response = result.response;
    if (!response) {
      logger.warn('[Form16 OCR] Gemini API returned no response, trying Vertex AI or fallback');
      return await tryVertexThenFallback(filePath, fileBase64, mimeType);
    }

    let text: string;
    try {
      text = response.text();
    } catch (textErr) {
      logger.warn('[Form16 OCR] Gemini API response.text() failed, trying Vertex AI or fallback:', textErr);
      return await tryVertexThenFallback(filePath, fileBase64, mimeType);
    }
    if (!text || !text.trim()) {
      return await tryVertexThenFallback(filePath, fileBase64, mimeType);
    }

    let extractedData: Record<string, unknown>;
    try {
      extractedData = parseModelJson(text);
    } catch (parseErr) {
      logger.warn('[Form16 OCR] Failed to parse Gemini API JSON, using fallback:', parseErr);
      return await fallbackExtraction(filePath);
    }

    const data = sanitizeAndCleanGeminiData(extractedData);
    return {
      success: true,
      data,
      method: 'gemini',
      ocrProvider: 'Google Gemini API',
    };
  } catch (error: unknown) {
    logger.error('[Form16 OCR] Gemini/Vertex extraction error:', error);
    logger.info('[Form16 OCR] Falling back to regex-based extraction');
    return await fallbackExtraction(filePath);
  }
}