Re_Backend/src/services/form16Ocr.service.ts

670 lines
29 KiB
TypeScript

/**
* Form 16A OCR: Google Gemini (primary) with regex fallback.
* Supports (1) Gemini API key, (2) Vertex AI with service account, (3) regex fallback.
*/
import { GoogleGenerativeAI } from '@google/generative-ai';
import { VertexAI } from '@google-cloud/vertexai';
import * as fs from 'fs';
import * as path from 'path';
import dotenv from 'dotenv';
import logger from '../utils/logger';
// Ensure .env is loaded even when the process was started from a different cwd:
// the file is looked up two directories above this module's directory, not
// relative to process.cwd(). The same directory is also the base for resolving
// relative credential paths further down in this file.
const backendDir = path.join(__dirname, '../..');
dotenv.config({ path: path.join(backendDir, '.env') });
/**
 * Locate the service-account key file used for Vertex AI.
 *
 * Lookup order: `GCP_KEY_FILE`, then `GOOGLE_APPLICATION_CREDENTIALS`, then the
 * key file under `<backendDir>/credentials`. Relative paths taken from the
 * environment are resolved against `backendDir`. A path that comes from the
 * environment is returned without checking that it exists; only the bundled
 * default is existence-checked.
 *
 * @returns Absolute path to the key file, or `null` when nothing is configured
 *          and the bundled default is absent.
 */
function getForm16VertexKeyPath(): string | null {
  const toAbsolute = (candidate: string): string =>
    path.isAbsolute(candidate) ? candidate : path.resolve(backendDir, candidate);

  const configured =
    process.env.GCP_KEY_FILE?.trim() || process.env.GOOGLE_APPLICATION_CREDENTIALS?.trim();
  if (configured) {
    return toAbsolute(configured);
  }

  const bundledKey = path.join(backendDir, 'credentials', 're-platform-workflow-dealer-3d5738fcc1f9.json');
  if (fs.existsSync(bundledKey)) {
    return bundledKey;
  }
  return null;
}
/**
 * Fields extracted from a Form 16A (TDS certificate).
 * Every field is optional and may be `null` when the value could not be found.
 */
export interface Form16AExtractedData {
/** Full "Name and address of the deductor" block as a single string. */
nameAndAddressOfDeductor?: string | null;
/** Deductor name only (first comma-separated part of the block when parsed locally). */
deductorName?: string | null;
/** Deductor address (remaining parts of the block after name, phone and email are removed). */
deductorAddress?: string | null;
/** Phone number found in the deductor block, if any. */
deductorPhone?: string | null;
/** E-mail address found in the deductor block, if any. */
deductorEmail?: string | null;
/** Total amount paid/credited ("Total(Rs)" in the payment summary). */
totalAmountPaid?: number | null;
/** Amount of tax deducted in respect of the deductee. */
totalTaxDeducted?: number | null;
/** Amount of tax deposited/remitted in respect of the deductee. */
totalTdsDeposited?: number | null;
/** TAN of the deductor: 4 letters + 5 digits + 1 letter (validated in this file). */
tanOfDeductor?: string | null;
/** Nature of payment, usually a section code such as 194Q. */
natureOfPayment?: string | null;
/** Transaction date as printed in the document (not normalised here). */
transactionDate?: string | null;
/** Status of matching with OLTAS, as printed (e.g. a single status letter). */
statusOfMatchingOltas?: string | null;
/** Date of booking / deposit as printed in the document. */
dateOfBooking?: string | null;
/** Assessment year, e.g. 2025-26. */
assessmentYear?: string | null;
/** Quarter, normalised to Q1..Q4 where possible. */
quarter?: string | null;
/** Certificate number of the Form 16A. */
form16aNumber?: string | null;
/** Financial year, e.g. 2024-25. */
financialYear?: string | null;
/** Certificate date as printed in the document. */
certificateDate?: string | null;
/** Populated in this file with the same value as `tanOfDeductor`. */
tanNumber?: string | null;
/** Populated in this file with the same value as `totalTaxDeducted`. */
tdsAmount?: number | null;
/** Populated in this file with the same value as `totalAmountPaid`. */
totalAmount?: number | null;
}
/** Outcome of a Form 16A extraction attempt. */
export interface Form16OcrResult {
/** `true` when an extraction path produced data; `false` on failure. */
success: boolean;
/** Extracted fields; set by the successful paths in this file. */
data?: Form16AExtractedData;
/** `'gemini'` for the Gemini API and Vertex AI paths, `'fallback'` for the regex path. */
method?: 'gemini' | 'fallback';
/** Human-readable provider label, e.g. 'Google Gemini API', 'Vertex AI (Gemini)', 'Regex fallback'. */
ocrProvider?: string;
/** Underlying error text, set by the failure paths in this file. */
error?: string;
/** Short description of the failure, set by the failure paths in this file. */
message?: string;
}
/**
 * Instruction prompt sent together with the document to the Gemini model
 * (both the API-key path and the Vertex AI path use it). It asks for a single
 * JSON object whose keys mirror `Form16AExtractedData`; the reply is parsed
 * with JSON.parse and then cleaned by `sanitizeAndCleanGeminiData`.
 * NOTE: this text is runtime behaviour - editing it changes what the model returns.
 */
const GEMINI_PROMPT = `You are an expert at extracting data from Indian Tax Form 16A certificates (TDS certificate under Section 203).
STEP 1 - Read the ENTIRE document: every table, every section, and every line. Form 16A has multiple parts: deductor details, deductee details, and one or more TABLES with payment/TDS figures.
STEP 2 - Extract these fields. For amounts, look in TABLES: find rows or columns with these labels and take the NUMBER in the same row/column (ignore ₹, Rs, commas):
1. nameAndAddressOfDeductor - "Name and address of the deductor". Full block in one string. Also extract: deductorName (person/entity name only), deductorAddress (street, city, state, PIN), deductorPhone, deductorEmail.
2. totalAmountPaid - In "Summary of payment" find "Total(Rs)" or "Amount paid/credited". The LARGE amount (e.g. 181968556.36). Not the TDS amount.
3. totalTaxDeducted - "Amount of Tax Deducted in respect of Deductee" in tax summary table. The TDS amount (e.g. 181969.00). Must be hundreds or more, NOT a single digit like 3.
4. totalTdsDeposited - "Amount of Tax Deposited / Remitted in respect of Deductee". Same as totalTaxDeducted if one total (e.g. 181969.00). NOT a page number.
5. tanOfDeductor - "TAN of the deductor" or "TAN". Must be exactly 10 characters: 4 uppercase letters + 5 digits + 1 letter (e.g. BLRH07660C). No spaces.
6. natureOfPayment - "Nature of payment" or "Section" or "Nature of Payment". Value is usually a section code like 194Q, 194A, 194I, or a short description. Extract that code or text.
7. transactionDate - "Transaction date" or "Date of payment" or "Period" end date. Format DD-MM-YYYY or DD/MM/YYYY.
8. statusOfMatchingOltas - "Status of matching with OLTAS" or "OLTAS". Single letter (F, O, M) or word like "Matched". Extract as shown.
9. dateOfBooking - "Date of booking" or "Date of deposit". DD-MM-YYYY or DD/MM/YYYY.
10. assessmentYear - "Assessment Year" or "AY" from the form header. Format YYYY-YY (e.g. 2025-26). This is the Form 16A assessment year.
11. quarter - "Quarter". Must be Q1, Q2, Q3, or Q4. If you see "Apr-Jun","Jul-Sep","Oct-Dec","Jan-Mar" or "Quarter 1" etc., convert to Q1, Q2, Q3, Q4.
12. form16aNumber - "Certificate Number" or "Certificate No" - the alphanumeric code (e.g. LTZKJZA, 12345). Do NOT return "Last" or "updated" or "on" (from "Last updated on"). Only the certificate ID. If unclear, return null.
13. financialYear - "Financial Year" or "FY". Format YYYY-YY. Can derive from Assessment Year (AY 2025-26 => FY 2024-25).
14. certificateDate - "Date of certificate" or "Last updated on". DD-MM-YYYY. Optional.
RULES:
- Scan every table in the document for amount and TDS figures. The totals are usually in the last row or a row labeled "Total".
- For amounts: output only a number (e.g. 128234.00), no currency symbol or commas.
- For form16aNumber: if the value is a single English word (e.g. contains, certificate, number, nil), return null.
- If a field is truly not in the document, set it to null.
- Return ONLY a single JSON object, no markdown, no \`\`\`, no explanation.
JSON format (use this exact structure):
{
"nameAndAddressOfDeductor": "string or null",
"deductorName": "string or null",
"deductorAddress": "string or null",
"deductorPhone": "string or null",
"deductorEmail": "string or null",
"totalAmountPaid": number or null,
"totalTaxDeducted": number or null,
"totalTdsDeposited": number or null,
"tanOfDeductor": "string or null",
"natureOfPayment": "string or null",
"transactionDate": "string or null",
"statusOfMatchingOltas": "string or null",
"dateOfBooking": "string or null",
"assessmentYear": "string or null",
"quarter": "string or null",
"form16aNumber": "string or null",
"financialYear": "string or null",
"certificateDate": "string or null"
}`;
// ----- Helpers (aligned with REform16) -----
/**
 * Coerce a loosely-typed value to a number.
 *
 * Numbers pass through unchanged (NaN is rejected). Anything else is
 * stringified, stripped of thousands separators and the currency markers
 * "₹", "Rs", "Rs." and "INR", then parsed with `parseFloat`.
 *
 * @param v Value to convert (typically a field of the model's JSON reply).
 * @returns The parsed number, or `null` for null/undefined/'' or unparsable input.
 */
function getNum(v: unknown): number | null {
  if (v === null || v === undefined || v === '') {
    return null;
  }
  if (typeof v === 'number' && !Number.isNaN(v)) {
    return v;
  }
  const withoutSeparators = String(v).replace(/,/g, '');
  const withoutCurrency = withoutSeparators.replace(/₹|Rs\.?|INR/gi, '').trim();
  const parsed = parseFloat(withoutCurrency);
  return Number.isNaN(parsed) ? null : parsed;
}
/**
 * Coerce a loosely-typed value to a trimmed, non-empty string.
 *
 * @param v Value to convert.
 * @returns `String(v).trim()`, or `null` when `v` is null/undefined or the
 *          trimmed text is empty.
 */
function getStr(v: unknown): string | null {
  if (v == null) {
    return null;
  }
  const trimmed = String(v).trim();
  return trimmed === '' ? null : trimmed;
}
/**
 * Split a comma-separated "name and address of the deductor" block into parts.
 *
 * The first segment that looks like an e-mail address and the first segment
 * that looks like a phone number are pulled out; of what remains, the first
 * segment is the name and the rest (re-joined with ", ") is the address.
 *
 * @param block Raw deductor block, or `null`.
 * @returns Name, address, phone and e-mail; each is `null` when not found.
 */
function parseDeductorBlock(block: string | null): { name: string | null; address: string | null; phone: string | null; email: string | null } {
  if (!block || typeof block !== 'string') {
    return { name: null, address: null, phone: null, email: null };
  }

  const segments = block
    .split(/[,]+/)
    .map((segment) => segment.trim())
    .filter((segment) => segment.length > 0);

  const emailIndex = segments.findIndex((segment) => /@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/.test(segment));
  const phoneIndex = segments.findIndex(
    (segment) => /^[\+]?[(]?[0-9\s\-()]{10,}$/.test(segment) || /^\+?91[\s\-]?\d{10}$/.test(segment),
  );

  // Whatever is neither the e-mail nor the phone segment is name + address.
  const remaining = segments.filter((_, index) => index !== emailIndex && index !== phoneIndex);
  const [name, ...addressParts] = remaining;

  return {
    name: name ?? null,
    address: addressParts.length > 0 ? addressParts.join(', ') : null,
    phone: phoneIndex >= 0 ? segments[phoneIndex] ?? null : null,
    email: emailIndex >= 0 ? segments[emailIndex] ?? null : null,
  };
}
/**
 * Sanity check for a TDS amount.
 *
 * Rejects null, NaN and negative values, and also whole numbers below 100
 * (e.g. a stray "3"). Values below 100 with a fractional part are accepted.
 *
 * @param n Candidate amount.
 * @returns `true` when the value is accepted as a TDS amount.
 */
function isValidTdsAmount(n: number | null): boolean {
  if (n == null || Number.isNaN(n) || n < 0) {
    return false;
  }
  return n >= 100 || !Number.isInteger(n);
}
/**
 * Find the total amount paid in raw Form 16A text.
 *
 * Tries the "Total (Rs)" figure that follows "Summary of payment" first, then
 * any "Total (Rs)" figure. A candidate is accepted only if it parses to a
 * number greater than zero.
 *
 * @param text Raw text of the certificate.
 * @returns The amount, or `null` when no acceptable figure is found.
 */
function extractTotalAmountPaidForm16A(text: string): number | null {
  const candidates = [
    /Summary\s+of\s+payment[\s\S]*?Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i,
    /Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i,
  ];
  for (const candidate of candidates) {
    const captured = candidate.exec(text)?.[1];
    if (!captured) continue;
    const amount = parseFloat(captured.replace(/,/g, ''));
    if (!Number.isNaN(amount) && amount > 0) {
      return amount;
    }
  }
  return null;
}
/**
 * Pull the TDS figures out of raw Form 16A text.
 *
 * Sources are tried in a fixed order and a later source only fills a field
 * that is still empty:
 *   1. a quarter row of the form `Q<n><receipt><deducted><deposited>`,
 *   2. the "Amount of Tax Deducted" / "Amount of Tax Deposited" labels,
 *   3. the "sum of Rs. ..." / "deposited ... Rs. ..." sentences.
 * `totalRs` is the "Total (Rs)" figure following the OLTAS status section.
 * Every figure must pass `isValidTdsAmount`.
 *
 * @param text Raw text of the certificate.
 * @returns The three figures; each is `null` when not found or rejected.
 */
function extractTDSAmountsForm16A(text: string): { taxDeducted: number | null; taxDeposited: number | null; totalRs: number | null } {
  // Strip thousands separators, parse, and keep the value only if it is plausible.
  const toValidAmount = (raw: string | undefined): number | null => {
    if (!raw) return null;
    const amount = parseFloat(raw.replace(/,/g, ''));
    return isValidTdsAmount(amount) ? amount : null;
  };

  let taxDeducted: number | null = null;
  let taxDeposited: number | null = null;

  // 1. Quarter row: both amounts must be present for the row to count.
  const quarterRow = /Q[1-4][A-Z0-9]*([0-9,]+\.\d{2})([0-9,]+\.\d{2})/.exec(text);
  if (quarterRow?.[1] && quarterRow?.[2]) {
    taxDeducted = toValidAmount(quarterRow[1]);
    taxDeposited = toValidAmount(quarterRow[2]);
  }

  // 2. Labelled figures.
  if (!taxDeducted) {
    const labelled = /Amount\s+of\s+Tax\s+Deducted[\s\S]*?([0-9,]{3,}\.?[0-9]*|[0-9,]+\.\d{2})\s*(?:Amount|Deductee|$)/i.exec(text);
    taxDeducted = toValidAmount(labelled?.[1]);
  }
  if (!taxDeposited) {
    const labelled = /Amount\s+of\s+Tax\s+Deposited[\s\S]*?([0-9,]{3,}\.?[0-9]*|[0-9,]+\.\d{2})/i.exec(text);
    taxDeposited = toValidAmount(labelled?.[1]);
  }

  // Total below the OLTAS status table.
  const statusTotal =
    /Status\s+of\s+matching[\s\S]*?Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i.exec(text) ||
    /OLTAS[\s\S]*?Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i.exec(text);
  const totalRs = toValidAmount(statusTotal?.[1]);

  // 3. Verification sentences.
  if (!taxDeducted) {
    const sentence = /sum\s+of\s+Rs\.?\s*([0-9,]+\.?[0-9]*)\s*\[?Rs\.?\s*[Oo]ne\s+[Ll]akh/i.exec(text);
    taxDeducted = toValidAmount(sentence?.[1]);
  }
  if (!taxDeposited) {
    const sentence = /deposited\s+[a-z\s]+Rs\.?\s*([0-9,]+\.?[0-9]*)\s*\[?Rs\.?\s*[Oo]ne/i.exec(text);
    taxDeposited = toValidAmount(sentence?.[1]);
  }

  return { taxDeducted, taxDeposited, totalRs };
}
/**
 * Find the quarter in raw Form 16A text.
 *
 * Looks for a "Quarter" label followed by Q1..Q4, then for a Q1..Q4 token
 * that starts a table row; if neither is present it defers to the more
 * general `extractQuarter`.
 *
 * @param text Raw text of the certificate.
 * @returns "Q1".."Q4", or whatever `extractQuarter` returns.
 */
function extractQuarterForm16A(text: string): string | null {
  const labelled = /\bQuarter\s*[:\s]*\n?\s*(Q[1-4])/i.exec(text);
  const hit = labelled ?? /\b(Q[1-4])[A-Z0-9]*\s*[0-9]/.exec(text);
  const code = hit?.[1];
  return code ? code.toUpperCase() : extractQuarter(text);
}
/**
 * Generic quarter detection for certificate text.
 *
 * First looks for an explicit quarter number ("Quarter 2", "Q3", "1 Quarter").
 * If none is found, the first month name in the text is mapped to the
 * financial-year quarter it belongs to (Apr-Jun = Q1 ... Jan-Mar = Q4).
 *
 * The bare "Q" pattern and the month names are anchored so they cannot match
 * inside other tokens: section code "194Q" followed by a date is not a
 * quarter, and "Summary" is not the month "Mar".
 *
 * @param text Raw text of the certificate.
 * @returns "Q1".."Q4", or `null` when nothing quarter-like is found.
 */
function extractQuarter(text: string): string | null {
  const numberedPatterns = [
    /Quarter[:\s]*([1-4])/i,
    // "Q" must not be the tail of another alphanumeric token (e.g. "194Q").
    /(?<![A-Za-z0-9])Q[:\s]*([1-4])/i,
    /([1-4])\s*Quarter/i,
  ];
  for (const pattern of numberedPatterns) {
    const digit = pattern.exec(text)?.[1];
    if (digit) return `Q${digit}`;
  }

  // Month names must stand alone as a run of letters ("01-Apr-2024", "April 2024").
  const monthMatch =
    /(?<![A-Za-z])(April|Apr|May|June|Jun|July|Jul|August|Aug|September|Sept|Sep|October|Oct|November|Nov|December|Dec|January|Jan|February|Feb|March|Mar)(?![A-Za-z])/i.exec(
      text,
    );
  if (!monthMatch?.[1]) return null;

  const quarterByMonth: Record<string, string> = {
    apr: 'Q1', may: 'Q1', jun: 'Q1',
    jul: 'Q2', aug: 'Q2', sep: 'Q2',
    oct: 'Q3', nov: 'Q3', dec: 'Q3',
    jan: 'Q4', feb: 'Q4', mar: 'Q4',
  };
  // The first three letters identify the month for both short and long names.
  return quarterByMonth[monthMatch[1].slice(0, 3).toLowerCase()] ?? null;
}
/**
 * Find the nature-of-payment section code (e.g. "194Q") in raw text.
 *
 * Tried in order: a code sitting between a two-decimal amount and a date in a
 * table row, a code following the "Nature of payment" label, and finally any
 * stand-alone code with a letter suffix.
 *
 * @param text Raw text of the certificate.
 * @returns The section code, or `null` when none is found.
 */
function extractNatureOfPayment(text: string): string | null {
  // Each probe is [pattern, index of the capture group holding the code].
  const probes: Array<[RegExp, number]> = [
    [/\.(\d{2})\s*(19[4-9][A-Z]?|20[0-6][A-Z]?)\s*\d{2}-/, 2],
    [/Nature\s+of\s+payment[\s\S]*?(19[4-9][A-Z]?|20[0-6][A-Z]?)/i, 1],
    [/\b(19[4-9][A-Z]|20[0-6][A-Z])\b/, 1],
  ];
  for (const [pattern, group] of probes) {
    const code = pattern.exec(text)?.[group];
    if (code) return code;
  }
  return null;
}
/**
 * Find the transaction date in raw text.
 *
 * Prefers the end date of a "Period From <d-Mon-yyyy> To <d-Mon-yyyy>" range;
 * otherwise returns the first numeric d-m-yyyy date in the text.
 *
 * @param text Raw text of the certificate.
 * @returns The date exactly as printed, or `null` when none is found.
 */
function extractTransactionDate(text: string): string | null {
  const period = /Period\s*From\s*(\d{1,2}-[A-Za-z]{3}-\d{4})\s*To\s*(\d{1,2}-[A-Za-z]{3}-\d{4})/i.exec(text);
  const periodEnd = period?.[2];
  if (periodEnd) {
    return periodEnd;
  }
  const firstNumericDate = /(\d{1,2}-\d{1,2}-\d{4})/.exec(text);
  return firstNumericDate?.[0] ?? null;
}
/**
 * Find the "status of matching with OLTAS" letter in raw text.
 *
 * Tried in order: a `<date> <number> <letter>` row after the OLTAS heading,
 * a letter directly followed by "Final" or "Unmatched", and any
 * `<date> <number> <letter>` row.
 *
 * @param text Raw text of the certificate.
 * @returns The status letter in upper case, or `null` when none is found.
 */
function extractOltasStatus(text: string): string | null {
  const rowAfterHeading = /Status\s+of\s+matching\s+with\s+OLTAS[\s\S]*?(\d{2}-\d{2}-\d{4})\s*(\d+)\s*([FOMUP])/i.exec(text);
  if (rowAfterHeading) {
    // Group 3 is the status letter; groups 1 and 2 are the date and the number.
    return (rowAfterHeading[3] ?? '').toUpperCase();
  }

  const legendEntry = /([FOMUP])\s*Final\s*|([FOMUP])\s*Unmatched/i.exec(text);
  if (legendEntry) {
    // Exactly one of the two alternatives captured the letter.
    return (legendEntry[1] || legendEntry[2] || '').toUpperCase();
  }

  const anyRow = /\d{2}-\d{2}-\d{4}\s*\d+\s*([FOMUP])/.exec(text);
  if (anyRow) {
    return (anyRow[1] ?? '').toUpperCase();
  }
  return null;
}
/**
 * Find the date of booking/deposit in raw text: the first numeric d-m-yyyy
 * date after "Date on which tax deposited", or failing that after "Challan".
 *
 * @param text Raw text of the certificate.
 * @returns The date exactly as printed, or `null` when none is found.
 */
function extractDateOfBooking(text: string): string | null {
  const afterDepositLabel = /Date\s+on\s+which\s+tax\s+deposited[\s\S]*?(\d{1,2}-\d{1,2}-\d{4})/i.exec(text);
  const hit = afterDepositLabel ?? /Challan[\s\S]*?(\d{1,2}-\d{1,2}-\d{4})/i.exec(text);
  if (!hit) {
    return null;
  }
  return hit[1] ?? null;
}
/**
 * Find the Form 16A certificate number in raw text.
 *
 * Each label pattern is tried once, in order. A captured token is discarded
 * when it is a known filler word (e.g. "Last" from "Last updated on") or is
 * shorter than three characters; otherwise it is returned if it contains a
 * digit or consists of 3-30 letters, digits or hyphens.
 *
 * @param text Raw text of the certificate.
 * @returns The certificate number, or `null` when no acceptable token is found.
 */
function extractForm16ANumber(text: string): string | null {
  const fillerWords = new Set(['contains', 'certificate', 'number', 'nil', 'na', 'n/a', 'none', 'last']);
  const labelPatterns = [
    /Certificate\s*No\.?[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
    /Form\s*16A\s*No\.?[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
    /Form\s*16A[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
    /Certificate\s*number[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
  ];

  for (const labelPattern of labelPatterns) {
    const captured = labelPattern.exec(text)?.[1];
    if (!captured) continue;

    const candidate = captured.trim();
    const isFiller = fillerWords.has(candidate.toLowerCase()) || candidate.length < 3;
    if (isFiller) continue;

    const looksLikeId = /\d/.test(candidate) || /^[A-Z0-9\-]{3,30}$/i.test(candidate);
    if (looksLikeId) return candidate;
  }
  return null;
}
/**
 * Find the deductor's TAN (4 letters + 5 digits + 1 letter) in raw text.
 *
 * Tried in order: a value after a "TAN" label, a value after "Tax Deduction
 * Account Number", then the first TAN-shaped token anywhere in the text.
 *
 * The last pattern must NOT carry the `g` flag: with `g`, `String.match`
 * returns the list of full matches, so index 1 is the second match rather
 * than the capture group and a single unlabelled TAN would be missed.
 *
 * @param text Raw text of the certificate.
 * @returns The TAN in upper case, or `null` when none is found.
 */
function extractTAN(text: string): string | null {
  const patterns = [
    /TAN[:\s]*([A-Z]{4}[0-9]{5}[A-Z]{1})/i,
    /Tax\s*Deduction\s*Account\s*Number[:\s]*([A-Z]{4}[0-9]{5}[A-Z]{1})/i,
    /([A-Z]{4}[0-9]{5}[A-Z]{1})/,
  ];
  for (const pattern of patterns) {
    const tan = pattern.exec(text)?.[1];
    if (tan) return tan.trim().toUpperCase();
  }
  return null;
}
/**
 * Find the deductor name in raw text: the run of letters, whitespace and
 * "&", ".", "," that follows "Deductor", "Name of Deductor" or "Company Name"
 * (labels tried in that order).
 *
 * @param text Raw text of the certificate.
 * @returns The trimmed name, or `null` when no label matches.
 */
function extractDeductorName(text: string): string | null {
  const labelPatterns = [
    /Deductor[:\s]*([A-Z][A-Za-z\s&.,]+)/i,
    /Name\s*of\s*Deductor[:\s]*([A-Z][A-Za-z\s&.,]+)/i,
    /Company\s*Name[:\s]*([A-Z][A-Za-z\s&.,]+)/i,
  ];
  const captured = labelPatterns
    .map((labelPattern) => labelPattern.exec(text)?.[1])
    .find((value) => Boolean(value));
  return captured ? captured.trim() : null;
}
/** Return the trimmed first capture group of the first pattern that matches, else `null`. */
function matchYearRange(text: string, patterns: readonly RegExp[]): string | null {
  for (const pattern of patterns) {
    const value = pattern.exec(text)?.[1];
    if (value) return value.trim();
  }
  return null;
}

/** Year range explicitly labelled "Financial Year" / "FY", exactly as printed. */
function findLabelledFinancialYear(text: string): string | null {
  return matchYearRange(text, [
    /Financial\s*Year[:\s]*([0-9]{4}[-/][0-9]{2,4})/i,
    // "FY" must not be the tail of a longer word.
    /(?<![A-Za-z])FY[:\s]*([0-9]{4}[-/][0-9]{2,4})/i,
  ]);
}

/** Year range explicitly labelled "Assessment Year" / "AY", exactly as printed. */
function findLabelledAssessmentYear(text: string): string | null {
  return matchYearRange(text, [
    /Assessment\s*Year[:\s]*([0-9]{4}[-/][0-9]{2,4})/i,
    // "AY" must not be the tail of a longer word (e.g. "pay 2024-25").
    /(?<![A-Za-z])AY[:\s]*([0-9]{4}[-/][0-9]{2,4})/i,
  ]);
}

/** Shift a "YYYY-YY" style range by `delta` years; the result is always formatted as YYYY-YY. */
function shiftYearRange(range: string, delta: number): string | null {
  const startYear = parseInt(range.slice(0, 4), 10);
  if (Number.isNaN(startYear)) return null;
  const shiftedStart = startYear + delta;
  return `${shiftedStart}-${String(shiftedStart + 1).slice(-2)}`;
}

/**
 * Find the financial year in raw certificate text.
 *
 * Precedence: an explicit "Financial Year"/"FY" value (returned as printed);
 * otherwise an explicit "Assessment Year"/"AY" value shifted back one year;
 * otherwise the first YYYY-YY shaped token anywhere in the text.
 *
 * Deriving from the labelled assessment year before falling back to the
 * generic token prevents an "Assessment Year 2025-26" line from being
 * reported as the financial year.
 *
 * @param text Raw text of the certificate.
 * @returns The financial year, or `null` when no year range is found.
 */
function extractFinancialYear(text: string): string | null {
  const labelled = findLabelledFinancialYear(text);
  if (labelled) return labelled;

  const labelledAssessmentYear = findLabelledAssessmentYear(text);
  if (labelledAssessmentYear) {
    const derived = shiftYearRange(labelledAssessmentYear, -1);
    if (derived) return derived;
  }

  return matchYearRange(text, [/([0-9]{4}[-/][0-9]{2,4})/]);
}

/**
 * Find the assessment year in raw certificate text.
 *
 * An explicit "Assessment Year"/"AY" value wins and is returned as printed.
 * Otherwise the year is derived from `extractFinancialYear` by shifting it
 * forward one year (FY 2024-25 => AY 2025-26).
 *
 * @param text Raw text of the certificate.
 * @returns The assessment year, or `null` when no year range is found.
 */
function extractAssessmentYear(text: string): string | null {
  const labelled = findLabelledAssessmentYear(text);
  if (labelled) return labelled;

  const financialYear = extractFinancialYear(text);
  return financialYear ? shiftYearRange(financialYear, 1) : null;
}
/**
 * Find the certificate date in raw text: a d/m/yyyy or d-m-yyyy date that
 * follows "Certificate Date", "Date" or "Issued on" (labels tried in that order).
 *
 * @param text Raw text of the certificate.
 * @returns The date exactly as printed, or `null` when no label matches.
 */
function extractCertificateDate(text: string): string | null {
  const labelPatterns = [
    /Certificate\s*Date[:\s]*([0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{4})/i,
    /Date[:\s]*([0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{4})/i,
    /Issued\s*on[:\s]*([0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{4})/i,
  ];
  let found: string | null = null;
  for (let i = 0; i < labelPatterns.length && found === null; i += 1) {
    const captured = labelPatterns[i]?.exec(text)?.[1];
    if (captured) {
      found = captured.trim();
    }
  }
  return found;
}
/**
 * Parse the raw text of a Form 16A into structured fields (REform16-aligned).
 *
 * The text is normalised first (lines trimmed, blank lines dropped) and every
 * field is then extracted independently by the regex helpers in this file.
 * `tanNumber`, `tdsAmount` and `totalAmount` repeat the values of
 * `tanOfDeductor`, `totalTaxDeducted` and `totalAmountPaid`.
 *
 * @param text Raw text extracted from the document.
 * @returns The extracted fields; fields that could not be found are `null`.
 */
function parseForm16ARawText(text: string): Form16AExtractedData {
  const fullText = text
    .split(/\r?\n/)
    .map((line) => line.trim())
    .filter(Boolean)
    .join('\n');

  // Deductor block: everything between the deductor heading and the next heading.
  const readDeductorBlock = (): string | null => {
    const blockStart = fullText.search(/Name\s+and\s+address\s+of\s+the\s+deductor/i);
    const blockEnd = fullText.search(/Name\s+and\s+address\s+of\s+the\s+deductee|PAN\s+of\s+the\s+deductor/i);
    if (blockStart === -1 || blockEnd === -1 || blockEnd <= blockStart) {
      return null;
    }
    const body = fullText
      .slice(blockStart, blockEnd)
      .replace(/Name\s+and\s+address\s+of\s+the\s+deductor\s*/i, '')
      .trim();
    const joined = body
      .split(/\n/)
      .map((line) => line.trim())
      .filter(Boolean)
      .join(', ');
    return joined || null;
  };
  const nameAndAddressOfDeductor = readDeductorBlock() || extractDeductorName(fullText);
  const deductorParts = parseDeductorBlock(nameAndAddressOfDeductor || '');

  // Amounts: the OLTAS total backs up either TDS figure, and the deducted
  // amount backs up the deposited one.
  const amounts = extractTDSAmountsForm16A(fullText);
  const totalAmountPaid = extractTotalAmountPaidForm16A(fullText);
  const totalTaxDeducted = amounts.taxDeducted ?? amounts.totalRs ?? null;
  const totalTdsDeposited = amounts.taxDeposited ?? amounts.totalRs ?? totalTaxDeducted ?? null;

  // Years: if no financial year is found, derive it from the assessment year.
  const assessmentYear = extractAssessmentYear(fullText);
  let financialYear = extractFinancialYear(fullText);
  if (!financialYear && assessmentYear) {
    const yearParts = assessmentYear.split(/[-/]/).map((part) => parseInt(part, 10));
    if (yearParts.length === 2 && !Number.isNaN(yearParts[1])) {
      financialYear = `${yearParts[0] - 1}-${String(yearParts[1] - 1).padStart(2, '0')}`;
    }
  }

  const tanOfDeductor = extractTAN(fullText);

  return {
    nameAndAddressOfDeductor,
    deductorName: deductorParts.name || nameAndAddressOfDeductor,
    deductorAddress: deductorParts.address ?? null,
    deductorPhone: deductorParts.phone ?? null,
    deductorEmail: deductorParts.email ?? null,
    totalAmountPaid: totalAmountPaid ?? null,
    totalTaxDeducted: totalTaxDeducted ?? null,
    totalTdsDeposited: totalTdsDeposited ?? null,
    tanOfDeductor,
    natureOfPayment: extractNatureOfPayment(fullText) ?? null,
    transactionDate: extractTransactionDate(fullText) ?? null,
    statusOfMatchingOltas: extractOltasStatus(fullText) ?? null,
    dateOfBooking: extractDateOfBooking(fullText) ?? null,
    assessmentYear: assessmentYear ?? null,
    quarter: extractQuarterForm16A(fullText) ?? null,
    form16aNumber: extractForm16ANumber(fullText),
    financialYear: financialYear ?? null,
    certificateDate: extractCertificateDate(fullText),
    tanNumber: tanOfDeductor,
    tdsAmount: totalTaxDeducted ?? null,
    totalAmount: totalAmountPaid ?? null,
  };
}
/**
 * Fallback: pdf-parse (v2 PDFParse API) + parseForm16ARawText (REform16-aligned).
 *
 * Reads the PDF, extracts its text and runs the regex parser over it. The TAN
 * is dropped when it does not have the expected shape and the quarter is
 * normalised to Q1..Q4. This function never throws: every failure is logged
 * and reported as `{ success: false, ... }`.
 *
 * @param filePath Path of the PDF to read.
 * @returns The extraction result, with `method: 'fallback'` on success.
 */
async function fallbackExtraction(filePath: string): Promise<Form16OcrResult> {
  try {
    const dataBuffer = await fs.promises.readFile(filePath);
    const { PDFParse } = await import('pdf-parse');
    const parser = new PDFParse({ data: new Uint8Array(dataBuffer) });

    let text: unknown;
    try {
      const textResult = await parser.getText();
      text = textResult?.text ?? '';
    } finally {
      // Release the parser even when text extraction throws.
      await parser.destroy();
    }

    // Whitespace-only output carries no data, so treat it like an empty document.
    if (typeof text !== 'string' || !text.trim()) {
      logger.warn('[Form16 OCR] Fallback: no text extracted from PDF');
      return { success: false, message: 'No text could be extracted from PDF', error: 'Empty PDF text' };
    }

    const extracted = parseForm16ARawText(text);

    if (extracted.tanOfDeductor && !/^[A-Z]{4}[0-9]{5}[A-Z]{1}$/.test(extracted.tanOfDeductor)) {
      extracted.tanOfDeductor = null;
      extracted.tanNumber = null;
    }
    if (extracted.quarter) {
      const q = extracted.quarter.toUpperCase().trim();
      const qMatch = q.match(/[Q]?([1-4])/);
      extracted.quarter = /^Q[1-4]$/.test(q) ? q : (qMatch ? `Q${qMatch[1]}` : null);
    }

    logger.info('[Form16 OCR] Fallback extraction completed');
    return {
      success: true,
      data: extracted,
      method: 'fallback',
      ocrProvider: 'Regex fallback',
    };
  } catch (error: unknown) {
    const errMsg = error instanceof Error ? error.message : String(error);
    logger.error('[Form16 OCR] Fallback extraction error:', error);
    return {
      success: false,
      error: errMsg,
      message: 'Failed to extract data from PDF',
    };
  }
}
/**
 * Turn the model's parsed JSON reply into a clean `Form16AExtractedData`.
 *
 * - Strings are trimmed; empty strings become `null`.
 * - Amounts are parsed to numbers; implausible TDS amounts (negative, or whole
 *   numbers below 100) are dropped.
 * - The TAN is upper-cased and dropped unless it is 4 letters + 5 digits + 1 letter.
 * - The quarter is normalised to Q1..Q4.
 * - The certificate number is dropped when it looks like filler text.
 * - Deductor name/address/phone/e-mail fall back to values parsed from the
 *   combined deductor block when the model did not supply them.
 *
 * @param extracted Parsed JSON object returned by the model.
 * @returns The cleaned fields.
 */
function sanitizeAndCleanGeminiData(extracted: Record<string, unknown>): Form16AExtractedData {
  // --- Certificate number -------------------------------------------------
  const fillerWords = ['contains', 'certificate', 'number', 'nil', 'na', 'n/a', 'none', 'not', 'mentioned', 'see', 'above', 'below', 'refer', 'document', 'form', 'the', 'a', 'an', 'last'];
  const looksLikeCertNumber = (value: string | null): boolean => {
    if (!value || value.length > 50) return false;
    const lower = value.toLowerCase();
    const isFiller = fillerWords.some(
      (word) => lower === word || lower.startsWith(word + ' ') || lower.endsWith(' ' + word),
    );
    if (isFiller) return false;
    return /\d/.test(value) || /^[A-Z0-9\-]{3,30}$/i.test(value);
  };
  const certNumberRaw = getStr(extracted.form16aNumber);
  const form16aNumber = certNumberRaw && looksLikeCertNumber(certNumberRaw) ? certNumberRaw : null;

  // --- TDS amounts ----------------------------------------------------------
  const plausibleTds = (amount: number | null): number | null => {
    if (amount == null || amount < 0) return null;
    if (amount < 100 && Number.isInteger(amount)) return null;
    return amount;
  };
  const taxDeducted = plausibleTds(getNum(extracted.totalTaxDeducted ?? extracted.tdsAmount));
  const taxDeposited = plausibleTds(getNum(extracted.totalTdsDeposited ?? extracted.tdsAmount));
  const amountPaid = getNum(extracted.totalAmountPaid ?? extracted.totalAmount);

  // --- Deductor -------------------------------------------------------------
  const deductorBlock = getStr(extracted.nameAndAddressOfDeductor ?? extracted.deductorName);
  const deductorParts = parseDeductorBlock(deductorBlock);

  // --- TAN ------------------------------------------------------------------
  const tanRaw = getStr(extracted.tanOfDeductor ?? extracted.tanNumber);
  const tanCandidate = tanRaw ? tanRaw.toUpperCase().trim() : null;
  const tan = tanCandidate && /^[A-Z]{4}[0-9]{5}[A-Z]{1}$/.test(tanCandidate) ? tanCandidate : null;

  // --- Quarter --------------------------------------------------------------
  let quarter: string | null = null;
  const quarterRaw = getStr(extracted.quarter);
  if (quarterRaw) {
    const normalised = quarterRaw.toUpperCase().trim();
    if (/^Q[1-4]$/.test(normalised)) {
      quarter = normalised;
    } else {
      const digit = normalised.match(/[Q]?([1-4])/);
      if (digit) quarter = `Q${digit[1]}`;
    }
  }

  return {
    nameAndAddressOfDeductor: deductorBlock,
    deductorName: getStr(extracted.deductorName ?? deductorParts.name) || deductorBlock,
    deductorAddress: getStr(extracted.deductorAddress ?? deductorParts.address),
    deductorPhone: getStr(extracted.deductorPhone ?? deductorParts.phone),
    deductorEmail: getStr(extracted.deductorEmail ?? deductorParts.email),
    totalAmountPaid: amountPaid,
    totalTaxDeducted: taxDeducted,
    totalTdsDeposited: taxDeposited,
    tanOfDeductor: tan,
    natureOfPayment: getStr(extracted.natureOfPayment),
    transactionDate: getStr(extracted.transactionDate),
    statusOfMatchingOltas: getStr(extracted.statusOfMatchingOltas),
    dateOfBooking: getStr(extracted.dateOfBooking),
    assessmentYear: getStr(extracted.assessmentYear),
    quarter,
    form16aNumber,
    financialYear: getStr(extracted.financialYear),
    certificateDate: getStr(extracted.certificateDate),
    tanNumber: tan,
    tdsAmount: taxDeducted,
    totalAmount: amountPaid,
  };
}
/**
 * Run Form 16A extraction via Vertex AI (service account).
 *
 * Project, location and model come from `GCP_PROJECT_ID`,
 * `FORM16_VERTEX_LOCATION` / `VERTEX_AI_LOCATION` and `GEMINI_MODEL`, each
 * with a built-in default. When no service-account key file is available,
 * when the model returns no text, or when its reply is not valid JSON, the
 * regex fallback is used instead. Errors thrown by the Vertex SDK call itself
 * are not caught here and propagate to the caller.
 *
 * @param filePath   Path of the document (used for logging and for the fallback).
 * @param fileBase64 Document content, base64-encoded.
 * @param mimeType   MIME type sent with the inline document data.
 * @returns The extraction result from Vertex AI or from the regex fallback.
 */
async function extractWithVertexAI(filePath: string, fileBase64: string, mimeType: string): Promise<Form16OcrResult> {
  const projectId = process.env.GCP_PROJECT_ID?.trim() || 're-platform-workflow-dealer';
  const location = process.env.FORM16_VERTEX_LOCATION?.trim() || process.env.VERTEX_AI_LOCATION?.trim() || 'us-central1';
  const modelId = process.env.GEMINI_MODEL?.trim() || 'gemini-2.0-flash-lite';

  const keyPath = getForm16VertexKeyPath();
  if (!keyPath || !fs.existsSync(keyPath)) {
    logger.warn('[Form16 OCR] Vertex: no service account key file found. Set GCP_KEY_FILE or GOOGLE_APPLICATION_CREDENTIALS.');
    return await fallbackExtraction(filePath);
  }

  const vertexAI = new VertexAI({
    project: projectId,
    location,
    googleAuthOptions: { keyFilename: keyPath },
  });
  const generativeModel = vertexAI.getGenerativeModel({
    model: modelId,
    generationConfig: { temperature: 0.1, topP: 0.95, topK: 40, maxOutputTokens: 8192 },
  });
  logger.info(`[Form16 OCR] Using Vertex AI (${modelId}, ${location}) for ${path.basename(filePath)}`);

  const request = {
    contents: [
      {
        role: 'user',
        parts: [
          { text: GEMINI_PROMPT },
          { inlineData: { mimeType, data: fileBase64 } },
        ],
      },
    ],
  };
  const response = await generativeModel.generateContent(request);

  // A candidate may spread its reply over several parts; reading only the
  // first one would truncate the JSON, so concatenate every text part.
  const replyParts = response.response?.candidates?.[0]?.content?.parts ?? [];
  const text = replyParts
    .map((part) => ('text' in part && typeof part.text === 'string' ? part.text : ''))
    .join('');
  if (!text.trim()) {
    logger.warn('[Form16 OCR] Vertex AI returned no text, using fallback');
    return await fallbackExtraction(filePath);
  }

  let extractedData: Record<string, unknown>;
  try {
    // Strip markdown code fences, then parse the outermost {...} if there is one.
    const cleaned = text.trim().replace(/```json\s*/g, '').replace(/```\s*/g, '').trim();
    const jsonMatch = cleaned.match(/\{[\s\S]*\}/);
    extractedData = JSON.parse(jsonMatch ? jsonMatch[0] : cleaned) as Record<string, unknown>;
  } catch (parseErr) {
    logger.warn('[Form16 OCR] Failed to parse Vertex AI JSON, using fallback:', parseErr);
    return await fallbackExtraction(filePath);
  }

  const data = sanitizeAndCleanGeminiData(extractedData);
  logger.info('[Form16 OCR] Vertex AI extraction completed successfully');
  return {
    success: true,
    data,
    method: 'gemini',
    ocrProvider: 'Vertex AI (Gemini)',
  };
}
/**
 * Extract Form 16A details from a PDF or image:
 * (1) Gemini API key, (2) Vertex AI with service account, (3) regex fallback.
 *
 * With `GEMINI_API_KEY` set, the Gemini API is tried first. If it yields no
 * usable text, Vertex AI is tried and then the regex fallback; if its reply is
 * not valid JSON, the regex fallback is used directly. Without an API key the
 * chain starts at Vertex AI. Any error thrown along the way is logged and
 * answered with the regex fallback.
 *
 * The initial file read happens before the error handling, so an unreadable
 * `filePath` throws to the caller.
 *
 * @param filePath Path of the document to process.
 * @returns The extraction result of the first path that succeeds, or the
 *          regex fallback's result.
 */
export async function extractForm16ADetails(filePath: string): Promise<Form16OcrResult> {
  const fileBuffer = fs.readFileSync(filePath);
  const fileBase64 = fileBuffer.toString('base64');

  // Send the real MIME type: labelling a JPEG or WEBP upload as PNG misdescribes
  // the payload. Unknown extensions keep the previous default of image/png.
  const mimeByExtension: Record<string, string> = {
    '.pdf': 'application/pdf',
    '.png': 'image/png',
    '.jpg': 'image/jpeg',
    '.jpeg': 'image/jpeg',
    '.webp': 'image/webp',
    '.heic': 'image/heic',
    '.heif': 'image/heif',
  };
  const mimeType = mimeByExtension[path.extname(filePath).toLowerCase()] ?? 'image/png';

  // Shared tail of every non-happy path: Vertex AI first, regex parser last.
  const vertexThenRegex = async (failureLog?: string): Promise<Form16OcrResult> => {
    const vertexResult = await extractWithVertexAI(filePath, fileBase64, mimeType);
    if (vertexResult.success) return vertexResult;
    if (failureLog) logger.warn(failureLog);
    return await fallbackExtraction(filePath);
  };

  try {
    const geminiKey = process.env.GEMINI_API_KEY?.trim();
    if (!geminiKey) {
      // No API key: use Vertex AI with service account
      return await vertexThenRegex('[Form16 OCR] Vertex AI failed or unavailable, using regex fallback');
    }

    const genAI = new GoogleGenerativeAI(geminiKey);
    const modelId = process.env.GEMINI_MODEL?.trim() || 'gemini-2.0-flash';
    const model = genAI.getGenerativeModel({
      model: modelId,
      generationConfig: { temperature: 0.1, topP: 0.95, topK: 40 },
    });
    logger.info(`[Form16 OCR] Using Gemini API (${modelId}) for ${path.basename(filePath)}`);

    const imagePart = { inlineData: { data: fileBase64, mimeType } };
    const result = await model.generateContent([GEMINI_PROMPT, imagePart]);
    const response = result.response;
    if (!response) {
      logger.warn('[Form16 OCR] Gemini API returned no response, trying Vertex AI or fallback');
      return await vertexThenRegex();
    }

    let text: string;
    try {
      text = response.text();
    } catch (textErr) {
      logger.warn('[Form16 OCR] Gemini API response.text() failed, trying Vertex AI or fallback:', textErr);
      return await vertexThenRegex();
    }
    if (!text || !text.trim()) {
      return await vertexThenRegex();
    }

    let extractedData: Record<string, unknown>;
    try {
      // Strip markdown code fences, then parse the outermost {...} if there is one.
      const cleaned = text.trim().replace(/```json\s*/g, '').replace(/```\s*/g, '').trim();
      const jsonMatch = cleaned.match(/\{[\s\S]*\}/);
      extractedData = JSON.parse(jsonMatch ? jsonMatch[0] : cleaned) as Record<string, unknown>;
    } catch (parseErr) {
      logger.warn('[Form16 OCR] Failed to parse Gemini API JSON, using fallback:', parseErr);
      return await fallbackExtraction(filePath);
    }

    const data = sanitizeAndCleanGeminiData(extractedData);
    return {
      success: true,
      data,
      method: 'gemini',
      ocrProvider: 'Google Gemini API',
    };
  } catch (error: unknown) {
    logger.error('[Form16 OCR] Gemini/Vertex extraction error:', error);
    logger.info('[Form16 OCR] Falling back to regex-based extraction');
    return await fallbackExtraction(filePath);
  }
}