670 lines
29 KiB
TypeScript
670 lines
29 KiB
TypeScript
/**
|
|
* Form 16A OCR: Google Gemini (primary) with regex fallback.
|
|
* Supports (1) Gemini API key, (2) Vertex AI with service account, (3) regex fallback.
|
|
*/
|
|
|
|
import { GoogleGenerativeAI } from '@google/generative-ai';
|
|
import { VertexAI } from '@google-cloud/vertexai';
|
|
import * as fs from 'fs';
|
|
import * as path from 'path';
|
|
import dotenv from 'dotenv';
|
|
|
|
import logger from '../utils/logger';
|
|
|
|
// The process may be started from a working directory other than the backend
// folder, so locate that folder relative to this file and hand dotenv an
// explicit path to its .env.
const backendDir = path.join(__dirname, '../..');
const backendEnvFile = path.join(backendDir, '.env');
dotenv.config({ path: backendEnvFile });
|
|
|
|
/**
 * Locate the service-account key file used for Vertex AI calls.
 *
 * Lookup order: `GCP_KEY_FILE`, then `GOOGLE_APPLICATION_CREDENTIALS`, then a
 * default JSON file under `<backendDir>/credentials`. Relative paths taken from
 * the environment are resolved against `backendDir`.
 *
 * @returns The key file path, or `null` when neither variable is set and the
 *   default file does not exist. A path taken from the environment is returned
 *   without checking that the file exists.
 */
function getForm16VertexKeyPath(): string | null {
  const configured = [process.env.GCP_KEY_FILE, process.env.GOOGLE_APPLICATION_CREDENTIALS]
    .map((value) => value?.trim())
    .find((value) => Boolean(value));
  if (configured) {
    return path.isAbsolute(configured) ? configured : path.resolve(backendDir, configured);
  }

  const bundledKey = path.join(backendDir, 'credentials', 're-platform-workflow-dealer-3d5738fcc1f9.json');
  if (fs.existsSync(bundledKey)) return bundledKey;
  return null;
}
|
|
|
|
/**
 * Values read from a Form 16A (TDS certificate). Every field is optional and
 * may be `null` when it could not be extracted.
 */
export interface Form16AExtractedData {
  // ----- Deductor -----
  /** Full "Name and address of the deductor" block as a single string. */
  nameAndAddressOfDeductor?: string | null;
  /** Deductor name only. */
  deductorName?: string | null;
  /** Deductor address without the name. */
  deductorAddress?: string | null;
  deductorPhone?: string | null;
  deductorEmail?: string | null;
  /** TAN of the deductor (4 letters + 5 digits + 1 letter). */
  tanOfDeductor?: string | null;

  // ----- Amounts -----
  /** Total amount paid/credited (the large figure, not the TDS). */
  totalAmountPaid?: number | null;
  /** Amount of tax deducted in respect of the deductee. */
  totalTaxDeducted?: number | null;
  /** Amount of tax deposited/remitted in respect of the deductee. */
  totalTdsDeposited?: number | null;

  // ----- Payment / challan details -----
  /** Section code (e.g. 194Q) or short description. */
  natureOfPayment?: string | null;
  transactionDate?: string | null;
  /** "Status of matching with OLTAS" value as shown on the certificate. */
  statusOfMatchingOltas?: string | null;
  dateOfBooking?: string | null;

  // ----- Certificate identification -----
  /** Assessment year, e.g. 2025-26. */
  assessmentYear?: string | null;
  /** Q1..Q4. */
  quarter?: string | null;
  /** Certificate number. */
  form16aNumber?: string | null;
  /** Financial year, e.g. 2024-25. */
  financialYear?: string | null;
  certificateDate?: string | null;

  // ----- Aliases (the producers in this file fill them with the same values) -----
  /** Same value as `tanOfDeductor`. */
  tanNumber?: string | null;
  /** Same value as `totalTaxDeducted`. */
  tdsAmount?: number | null;
  /** Same value as `totalAmountPaid`. */
  totalAmount?: number | null;
}
|
|
|
|
/** Outcome of a Form 16A extraction attempt. */
export interface Form16OcrResult {
  /** `true` when an extraction path produced data. */
  success: boolean;
  /** Extracted fields; set by the successful paths in this file. */
  data?: Form16AExtractedData;
  /** `'gemini'` for the Gemini API / Vertex AI paths, `'fallback'` for the regex path. */
  method?: 'gemini' | 'fallback';
  /** Human-readable provider label, e.g. "Google Gemini API" or "Regex fallback". */
  ocrProvider?: string;
  /** Low-level error text on failure. */
  error?: string;
  /** User-facing summary on failure. */
  message?: string;
}
|
|
|
|
// Instruction prompt sent as the text part, next to the uploaded document, in
// both generateContent calls in this file (Gemini API and Vertex AI). It asks
// the model for one JSON object whose keys are listed under "JSON format";
// sanitizeAndCleanGeminiData() reads those keys.
// NOTE(review): the standalone `|` lines inside this literal look like
// copy/extraction residue, yet they are part of the string sent to the model —
// confirm against the canonical file before removing them.
const GEMINI_PROMPT = `You are an expert at extracting data from Indian Tax Form 16A certificates (TDS certificate under Section 203).
|
|
|
|
STEP 1 - Read the ENTIRE document: every table, every section, and every line. Form 16A has multiple parts: deductor details, deductee details, and one or more TABLES with payment/TDS figures.
|
|
|
|
STEP 2 - Extract these fields. For amounts, look in TABLES: find rows or columns with these labels and take the NUMBER in the same row/column (ignore ₹, Rs, commas):
|
|
|
|
1. nameAndAddressOfDeductor - "Name and address of the deductor". Full block in one string. Also extract: deductorName (person/entity name only), deductorAddress (street, city, state, PIN), deductorPhone, deductorEmail.
|
|
|
|
2. totalAmountPaid - In "Summary of payment" find "Total(Rs)" or "Amount paid/credited". The LARGE amount (e.g. 181968556.36). Not the TDS amount.
|
|
|
|
3. totalTaxDeducted - "Amount of Tax Deducted in respect of Deductee" in tax summary table. The TDS amount (e.g. 181969.00). Must be hundreds or more, NOT a single digit like 3.
|
|
|
|
4. totalTdsDeposited - "Amount of Tax Deposited / Remitted in respect of Deductee". Same as totalTaxDeducted if one total (e.g. 181969.00). NOT a page number.
|
|
|
|
5. tanOfDeductor - "TAN of the deductor" or "TAN". Must be exactly 10 characters: 4 uppercase letters + 5 digits + 1 letter (e.g. BLRH07660C). No spaces.
|
|
|
|
6. natureOfPayment - "Nature of payment" or "Section" or "Nature of Payment". Value is usually a section code like 194Q, 194A, 194I, or a short description. Extract that code or text.
|
|
|
|
7. transactionDate - "Transaction date" or "Date of payment" or "Period" end date. Format DD-MM-YYYY or DD/MM/YYYY.
|
|
|
|
8. statusOfMatchingOltas - "Status of matching with OLTAS" or "OLTAS". Single letter (F, O, M) or word like "Matched". Extract as shown.
|
|
|
|
9. dateOfBooking - "Date of booking" or "Date of deposit". DD-MM-YYYY or DD/MM/YYYY.
|
|
|
|
10. assessmentYear - "Assessment Year" or "AY" from the form header. Format YYYY-YY (e.g. 2025-26). This is the Form 16A assessment year.
|
|
|
|
11. quarter - "Quarter". Must be Q1, Q2, Q3, or Q4. If you see "Apr-Jun","Jul-Sep","Oct-Dec","Jan-Mar" or "Quarter 1" etc., convert to Q1, Q2, Q3, Q4.
|
|
|
|
12. form16aNumber - "Certificate Number" or "Certificate No" - the alphanumeric code (e.g. LTZKJZA, 12345). Do NOT return "Last" or "updated" or "on" (from "Last updated on"). Only the certificate ID. If unclear, return null.
|
|
|
|
13. financialYear - "Financial Year" or "FY". Format YYYY-YY. Can derive from Assessment Year (AY 2025-26 => FY 2024-25).
|
|
|
|
14. certificateDate - "Date of certificate" or "Last updated on". DD-MM-YYYY. Optional.
|
|
|
|
RULES:
|
|
- Scan every table in the document for amount and TDS figures. The totals are usually in the last row or a row labeled "Total".
|
|
- For amounts: output only a number (e.g. 128234.00), no currency symbol or commas.
|
|
- For form16aNumber: if the value is a single English word (e.g. contains, certificate, number, nil), return null.
|
|
- If a field is truly not in the document, set it to null.
|
|
- Return ONLY a single JSON object, no markdown, no \`\`\`, no explanation.
|
|
|
|
JSON format (use this exact structure):
|
|
{
|
|
"nameAndAddressOfDeductor": "string or null",
|
|
"deductorName": "string or null",
|
|
"deductorAddress": "string or null",
|
|
"deductorPhone": "string or null",
|
|
"deductorEmail": "string or null",
|
|
"totalAmountPaid": number or null,
|
|
"totalTaxDeducted": number or null,
|
|
"totalTdsDeposited": number or null,
|
|
"tanOfDeductor": "string or null",
|
|
"natureOfPayment": "string or null",
|
|
"transactionDate": "string or null",
|
|
"statusOfMatchingOltas": "string or null",
|
|
"dateOfBooking": "string or null",
|
|
"assessmentYear": "string or null",
|
|
"quarter": "string or null",
|
|
"form16aNumber": "string or null",
|
|
"financialYear": "string or null",
|
|
"certificateDate": "string or null"
|
|
}`;
|
|
|
|
// ----- Helpers (aligned with REform16) -----
|
|
/**
 * Coerce a loosely-typed value (typically a field of model-produced JSON) into
 * a finite number.
 *
 * Accepts numbers and numeric strings; thousands separators and the currency
 * markers `₹`, `Rs`, `Rs.` and `INR` are stripped before parsing.
 *
 * @param v Raw value.
 * @returns The parsed number, or `null` for nullish, empty, non-numeric,
 *   non-finite, or non-scalar input.
 */
function getNum(v: unknown): number | null {
  if (typeof v === 'number') return Number.isFinite(v) ? v : null;
  // Only strings can carry a formatted amount. Stringifying arrays/objects would
  // fabricate values (e.g. [1, 2] -> "1,2" -> 12 once commas are removed).
  if (typeof v !== 'string') return null;

  const cleaned = v.replace(/,/g, '').replace(/₹|Rs\.?|INR/gi, '').trim();
  if (cleaned === '') return null;

  const parsed = parseFloat(cleaned);
  return Number.isFinite(parsed) ? parsed : null;
}
|
|
|
|
/**
 * Coerce a loosely-typed value into a trimmed, non-empty string.
 *
 * Scalars are stringified and trimmed. Arrays are flattened by converting each
 * element with the same rules and joining the usable ones with ", ". Plain
 * objects, functions and symbols yield `null` instead of leaking text such as
 * "[object Object]" into extracted fields.
 *
 * @param v Raw value.
 * @returns The cleaned string, or `null` when nothing usable remains.
 */
function getStr(v: unknown): string | null {
  if (v == null) return null;

  if (Array.isArray(v)) {
    const pieces = v
      .map((item) => getStr(item))
      .filter((item): item is string => item !== null);
    return pieces.length > 0 ? pieces.join(', ') : null;
  }

  if (typeof v === 'object' || typeof v === 'function' || typeof v === 'symbol') return null;

  const text = String(v).trim();
  return text === '' ? null : text;
}
|
|
|
|
/**
 * Split a comma-separated "name and address" block into its parts.
 *
 * The first segment that looks like an e-mail address and the first that looks
 * like a phone number are pulled out; of what remains, the first segment is the
 * name and the rest (re-joined with ", ") is the address.
 *
 * @param block Comma-separated deductor block, or `null`.
 * @returns Name, address, phone and e-mail; each is `null` when not found.
 */
function parseDeductorBlock(block: string | null): { name: string | null; address: string | null; phone: string | null; email: string | null } {
  const parsed: { name: string | null; address: string | null; phone: string | null; email: string | null } = {
    name: null,
    address: null,
    phone: null,
    email: null,
  };
  if (typeof block !== 'string' || block === '') return parsed;

  const segments = block
    .split(/[,]+/)
    .map((segment) => segment.trim())
    .filter((segment) => segment.length > 0);

  // Both lookups run against the untouched list before anything is removed.
  const emailSegment = segments.find((segment) => /@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}/.test(segment));
  const phoneSegment = segments.find(
    (segment) => /^[\+]?[(]?[0-9\s\-()]{10,}$/.test(segment) || /^\+?91[\s\-]?\d{10}$/.test(segment),
  );

  if (emailSegment) {
    parsed.email = emailSegment;
    segments.splice(segments.indexOf(emailSegment), 1);
  }
  if (phoneSegment) {
    parsed.phone = phoneSegment;
    segments.splice(segments.indexOf(phoneSegment), 1);
  }

  const [first, ...rest] = segments;
  if (first !== undefined) {
    parsed.name = first;
    if (rest.length > 0) parsed.address = rest.join(', ');
  }
  return parsed;
}
|
|
|
|
/**
 * Plausibility filter for a TDS figure.
 *
 * Rejects missing, NaN and negative values, and also whole numbers below 100,
 * which are treated as stray digits rather than amounts. Values of 100 or more
 * and fractional values below 100 are accepted.
 */
function isValidTdsAmount(n: number | null): boolean {
  if (n == null) return false;
  if (Number.isNaN(n) || n < 0) return false;
  return n >= 100 || !Number.isInteger(n);
}
|
|
|
|
/**
 * Find the total amount paid: the number following "Total (Rs)", preferring the
 * occurrence after a "Summary of payment" heading.
 *
 * @returns The first positive amount found, or `null`.
 */
function extractTotalAmountPaidForm16A(text: string): number | null {
  const candidates = [
    /Summary\s+of\s+payment[\s\S]*?Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i,
    /Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i,
  ];
  for (const candidate of candidates) {
    const captured = candidate.exec(text)?.[1];
    if (!captured) continue;
    const amount = parseFloat(captured.replace(/,/g, ''));
    // `amount > 0` is false for NaN as well, so unparsable captures are skipped.
    if (amount > 0) return amount;
  }
  return null;
}
|
|
|
|
/**
 * Pull the tax-deducted, tax-deposited and "Total (Rs)" figures out of Form 16A
 * text. Sources are tried in order and an earlier hit is never overwritten:
 *
 * 1. a quarter row (`Q1..Q4` followed by two amounts with two decimals),
 * 2. the "Amount of Tax Deducted" / "Amount of Tax Deposited" labels,
 * 3. the "sum of Rs. <amount> [Rs. <amount in words>" certification sentence.
 *
 * `totalRs` is the "Total (Rs)" that follows the OLTAS section. Every value
 * must pass `isValidTdsAmount` before it is stored.
 *
 * @returns The three figures; each is `null` when not found or implausible.
 */
function extractTDSAmountsForm16A(text: string): { taxDeducted: number | null; taxDeposited: number | null; totalRs: number | null } {
  const result = { taxDeducted: null as number | null, taxDeposited: null as number | null, totalRs: null as number | null };
  const toAmount = (raw: string): number => parseFloat(raw.replace(/,/g, ''));

  const quarterLine = text.match(/Q[1-4][A-Z0-9]*([0-9,]+\.\d{2})([0-9,]+\.\d{2})/);
  if (quarterLine?.[1] && quarterLine?.[2]) {
    const deducted = toAmount(quarterLine[1]);
    const deposited = toAmount(quarterLine[2]);
    if (isValidTdsAmount(deducted)) result.taxDeducted = deducted;
    if (isValidTdsAmount(deposited)) result.taxDeposited = deposited;
  }

  const taxDeductedM = text.match(/Amount\s+of\s+Tax\s+Deducted[\s\S]*?([0-9,]{3,}\.?[0-9]*|[0-9,]+\.\d{2})\s*(?:Amount|Deductee|$)/i);
  if (taxDeductedM?.[1] && result.taxDeducted == null) {
    const n = toAmount(taxDeductedM[1]);
    if (isValidTdsAmount(n)) result.taxDeducted = n;
  }

  const taxDepositedM = text.match(/Amount\s+of\s+Tax\s+Deposited[\s\S]*?([0-9,]{3,}\.?[0-9]*|[0-9,]+\.\d{2})/i);
  if (taxDepositedM?.[1] && result.taxDeposited == null) {
    const n = toAmount(taxDepositedM[1]);
    if (isValidTdsAmount(n)) result.taxDeposited = n;
  }

  const totalRsM = text.match(/Status\s+of\s+matching[\s\S]*?Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i)
    || text.match(/OLTAS[\s\S]*?Total\s*\(Rs\.?\)\s*([0-9,]+\.?[0-9]*)/i);
  if (totalRsM?.[1]) {
    const n = toAmount(totalRsM[1]);
    if (isValidTdsAmount(n)) result.totalRs = n;
  }

  // Certification sentence: "... a sum of Rs. 5000.00 [Rs. Five Thousand Only] ...".
  // The amount in words may start with any word; the previous patterns required
  // it to begin with "One Lakh" / "One" and so missed most other amounts.
  const rsDeductedM = text.match(/sum\s+of\s+Rs\.?\s*([0-9,]+\.?[0-9]*)\s*\[?\s*Rs\.?\s*[A-Za-z]/i);
  if (rsDeductedM?.[1] && result.taxDeducted == null) {
    const n = toAmount(rsDeductedM[1]);
    if (isValidTdsAmount(n)) result.taxDeducted = n;
  }

  const rsDepositedM = text.match(/deposited\s+[a-z\s]+Rs\.?\s*([0-9,]+\.?[0-9]*)\s*\[?\s*Rs\.?\s*[A-Za-z]/i);
  if (rsDepositedM?.[1] && result.taxDeposited == null) {
    const n = toAmount(rsDepositedM[1]);
    if (isValidTdsAmount(n)) result.taxDeposited = n;
  }

  // Each assignment above is already guarded by isValidTdsAmount, so no final
  // re-validation pass is needed.
  return result;
}
|
|
|
|
/**
 * Form 16A specific quarter lookup: a `Q1..Q4` token after a "Quarter" label,
 * else a `Q1..Q4` token that is followed (after optional alphanumerics and
 * whitespace) by a digit. Falls back to `extractQuarter` when neither matches.
 *
 * @returns The quarter code in upper case, or whatever the fallback returns.
 */
function extractQuarterForm16A(text: string): string | null {
  const labelled = text.match(/\bQuarter\s*[:\s]*\n?\s*(Q[1-4])/i);
  const hit = labelled || text.match(/\b(Q[1-4])[A-Z0-9]*\s*[0-9]/);
  const code = hit?.[1];
  return code ? code.toUpperCase() : extractQuarter(text);
}
|
|
|
|
/**
 * Generic quarter detection.
 *
 * First looks for a quarter number 1-4 next to "Quarter" or "Q". If none is
 * found, the first standalone month name or abbreviation in the text is mapped
 * to its quarter of the April-March year (Apr-Jun = Q1 ... Jan-Mar = Q4).
 *
 * Month names must not be embedded in a longer word; otherwise ordinary words
 * such as "Summary" or "Remarks" (which contain "mar") would be read as March.
 *
 * @returns `Q1`..`Q4`, or `null` when nothing matches.
 */
function extractQuarter(text: string): string | null {
  const numberedPatterns = [
    /Quarter[:\s]*([1-4])/i,
    /Q[:\s]*([1-4])/i,
    /([1-4])\s*Quarter/i,
  ];
  for (const pattern of numberedPatterns) {
    const m = text.match(pattern);
    if (m?.[1]) return `Q${m[1]}`;
  }

  // Letters on either side disqualify a match; digits and punctuation do not,
  // so "01-Apr-2024" and "Apr-Jun" are still recognised.
  const monthMatch = text.match(
    /(?:^|[^A-Za-z])(Apr|April|May|Jun|June|Jul|July|Aug|August|Sep|Sept|September|Oct|October|Nov|November|Dec|December|Jan|January|Feb|February|Mar|March)(?![A-Za-z])/i,
  );
  if (!monthMatch) return null;

  const quarterByMonth: Record<string, string> = {
    apr: 'Q1', may: 'Q1', jun: 'Q1',
    jul: 'Q2', aug: 'Q2', sep: 'Q2',
    oct: 'Q3', nov: 'Q3', dec: 'Q3',
    jan: 'Q4', feb: 'Q4', mar: 'Q4',
  };
  const monthKey = monthMatch[1].toLowerCase().slice(0, 3);
  return quarterByMonth[monthKey] ?? null;
}
|
|
|
|
/**
 * Find the section code (194-199 or 200-206, optionally followed by a letter)
 * describing the nature of payment. Tried in order:
 *
 * 1. a code sitting between a two-digit decimal part and a `dd-` date fragment,
 * 2. the first code after a "Nature of payment" label,
 * 3. any standalone code that carries a letter suffix.
 *
 * @returns The code as it appears in the text, or `null`.
 */
function extractNatureOfPayment(text: string): string | null {
  const tableRow = /\.(\d{2})\s*(19[4-9][A-Z]?|20[0-6][A-Z]?)\s*\d{2}-/.exec(text);
  if (tableRow?.[2]) return tableRow[2];

  const labelled = /Nature\s+of\s+payment[\s\S]*?(19[4-9][A-Z]?|20[0-6][A-Z]?)/i.exec(text);
  if (labelled?.[1]) return labelled[1];

  const standalone = /\b(19[4-9][A-Z]|20[0-6][A-Z])\b/.exec(text);
  return standalone ? standalone[1] : null;
}
|
|
|
|
/**
 * Pick a transaction date: the "To" date of a "Period From <d-Mon-yyyy> To
 * <d-Mon-yyyy>" range when present, otherwise the first numeric `d-m-yyyy`
 * date anywhere in the text.
 */
function extractTransactionDate(text: string): string | null {
  const period = /Period\s*From\s*(\d{1,2}-[A-Za-z]{3}-\d{4})\s*To\s*(\d{1,2}-[A-Za-z]{3}-\d{4})/i.exec(text);
  const periodEnd = period?.[2];
  if (periodEnd) return periodEnd;

  const numericDates = text.match(/(\d{1,2}-\d{1,2}-\d{4})/g);
  return numericDates === null ? null : numericDates[0];
}
|
|
|
|
/**
 * Read the OLTAS matching status letter (one of F, O, M, U, P).
 *
 * Patterns are tried in order and the first one that matches decides:
 * a `dd-mm-yyyy <digits> <letter>` row after the "Status of matching with
 * OLTAS" heading; a letter directly before "Final" or "Unmatched"; any
 * `dd-mm-yyyy <digits> <letter>` row.
 *
 * @returns The status letter in upper case, or `null` when nothing matches.
 */
function extractOltasStatus(text: string): string | null {
  const attempts = [
    /Status\s+of\s+matching\s+with\s+OLTAS[\s\S]*?(\d{2}-\d{2}-\d{4})\s*(\d+)\s*([FOMUP])/i,
    /([FOMUP])\s*Final\s*|([FOMUP])\s*Unmatched/i,
    /\d{2}-\d{2}-\d{4}\s*\d+\s*([FOMUP])/,
  ];
  for (const attempt of attempts) {
    const hit = text.match(attempt);
    if (hit) {
      // The letter lives in a different capture group depending on the pattern.
      const letter = hit[3] || hit[1] || hit[2] || '';
      return letter.toUpperCase();
    }
  }
  return null;
}
|
|
|
|
/**
 * Find the booking/deposit date: the first `d-m-yyyy` date after "Date on which
 * tax deposited", else the first one after "Challan".
 */
function extractDateOfBooking(text: string): string | null {
  const afterDepositLabel = /Date\s+on\s+which\s+tax\s+deposited[\s\S]*?(\d{1,2}-\d{1,2}-\d{4})/i.exec(text);
  const hit = afterDepositLabel ?? /Challan[\s\S]*?(\d{1,2}-\d{1,2}-\d{4})/i.exec(text);
  return hit ? hit[1] : null;
}
|
|
|
|
/**
 * Find the certificate number that follows a "Certificate No", "Form 16A No",
 * "Form 16A" or "Certificate number" label.
 *
 * A candidate is skipped when it is one of a few filler words (e.g. "nil",
 * "number", "last") or shorter than three characters; the next label pattern is
 * then tried.
 *
 * @returns The certificate number as it appears in the text, or `null`.
 */
function extractForm16ANumber(text: string): string | null {
  const fillerWords = ['contains', 'certificate', 'number', 'nil', 'na', 'n/a', 'none', 'last'];
  const labelPatterns = [
    /Certificate\s*No\.?[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
    /Form\s*16A\s*No\.?[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
    /Form\s*16A[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
    /Certificate\s*number[:\s]*([A-Z0-9][A-Z0-9\-]{2,30})/i,
  ];

  for (const labelPattern of labelPatterns) {
    const candidate = text.match(labelPattern)?.[1]?.trim();
    if (!candidate) continue;
    if (candidate.length < 3 || fillerWords.includes(candidate.toLowerCase())) continue;
    if (/\d/.test(candidate) || /^[A-Z0-9\-]{3,30}$/i.test(candidate)) return candidate;
  }
  return null;
}
|
|
|
|
/**
 * Find the deductor's TAN (4 letters + 5 digits + 1 letter).
 *
 * Labelled occurrences ("TAN", "Tax Deduction Account Number") are preferred
 * and matched case-insensitively; otherwise the first upper-case TAN-shaped
 * token anywhere in the text is used.
 *
 * @returns The TAN in upper case, or `null`.
 */
function extractTAN(text: string): string | null {
  const patterns = [
    /TAN[:\s]*([A-Z]{4}[0-9]{5}[A-Z]{1})/i,
    /Tax\s*Deduction\s*Account\s*Number[:\s]*([A-Z]{4}[0-9]{5}[A-Z]{1})/i,
    // No `g` flag: with it, String.match() returns the list of whole matches, so
    // index 1 would be the *second* TAN in the text (or undefined) rather than
    // capture group 1.
    /([A-Z]{4}[0-9]{5}[A-Z]{1})/,
  ];
  for (const pattern of patterns) {
    const match = text.match(pattern);
    if (match?.[1]) return match[1].trim().toUpperCase();
  }
  return null;
}
|
|
|
|
/**
 * Best-effort deductor name: the run of letters, whitespace and `& . ,` that
 * follows a "Deductor", "Name of Deductor" or "Company Name" label.
 *
 * @returns The trimmed name, or `null` when no label matches.
 */
function extractDeductorName(text: string): string | null {
  const labelPatterns = [
    /Deductor[:\s]*([A-Z][A-Za-z\s&.,]+)/i,
    /Name\s*of\s*Deductor[:\s]*([A-Z][A-Za-z\s&.,]+)/i,
    /Company\s*Name[:\s]*([A-Z][A-Za-z\s&.,]+)/i,
  ];
  for (const labelPattern of labelPatterns) {
    const captured = labelPattern.exec(text)?.[1];
    if (captured) return captured.trim();
  }
  return null;
}
|
|
|
|
/**
 * Find the financial year.
 *
 * Order of preference:
 * 1. a value labelled "Financial Year" or "FY", returned as written;
 * 2. a value labelled "Assessment Year", converted to the preceding financial
 *    year (AY 2025-26 -> FY 2024-25);
 * 3. the first `YYYY-YY` / `YYYY-YYYY` looking token anywhere in the text.
 *
 * Step 2 stops the generic step 3 from returning the assessment year itself as
 * the financial year on documents that only print "Assessment Year".
 *
 * @returns The financial year string, or `null`.
 */
function extractFinancialYear(text: string): string | null {
  const labelledPatterns = [
    /Financial\s*Year[:\s]*([0-9]{4}[-/][0-9]{2,4})/i,
    /FY[:\s]*([0-9]{4}[-/][0-9]{2,4})/i,
  ];
  for (const pattern of labelledPatterns) {
    const match = text.match(pattern);
    if (match?.[1]) return match[1].trim();
  }

  const assessment = text.match(/Assessment\s*Year[:\s]*([0-9]{4})[-/][0-9]{2,4}/i);
  if (assessment?.[1]) {
    const assessmentStart = parseInt(assessment[1], 10);
    // The financial year ends in the calendar year the assessment year starts in.
    return `${assessmentStart - 1}-${String(assessmentStart).slice(-2)}`;
  }

  const generic = text.match(/([0-9]{4}[-/][0-9]{2,4})/);
  return generic?.[1] ? generic[1].trim() : null;
}
|
|
|
|
/**
 * Find the assessment year.
 *
 * A value explicitly labelled "Assessment Year" or "AY" wins. Only when no such
 * label exists is the assessment year derived from `extractFinancialYear`
 * (FY 2024-25 -> AY 2025-26). Checking the label first matters because the
 * financial-year lookup may fall back to any year-range-looking token, and
 * deriving from that can shift the assessment year by one.
 *
 * @returns The assessment year (`YYYY-YY` when derived), or `null`.
 */
function extractAssessmentYear(text: string): string | null {
  const labelledPatterns = [
    /Assessment\s*Year[:\s]*([0-9]{4}[-/][0-9]{2,4})/i,
    // \b keeps "AY" from matching the tail of words such as "May" or "Day".
    /\bAY[:\s]*([0-9]{4}[-/][0-9]{2,4})/i,
  ];
  for (const pattern of labelledPatterns) {
    const match = text.match(pattern);
    if (match?.[1]) return match[1].trim();
  }

  const fyMatch = extractFinancialYear(text);
  if (fyMatch) {
    const parts = fyMatch.split(/[-/]/);
    if (parts.length === 2) {
      const startYear = parseInt(parts[0], 10);
      return `${startYear + 1}-${(startYear + 2).toString().slice(-2)}`;
    }
  }
  return null;
}
|
|
|
|
/**
 * Find the certificate date: a numeric `d-m-yyyy` or `d/m/yyyy` value directly
 * after "Certificate Date", then after a bare "Date", then after "Issued on".
 *
 * @returns The date as written, or `null`.
 */
function extractCertificateDate(text: string): string | null {
  const labelPatterns = [
    /Certificate\s*Date[:\s]*([0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{4})/i,
    /Date[:\s]*([0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{4})/i,
    /Issued\s*on[:\s]*([0-9]{1,2}[-/][0-9]{1,2}[-/][0-9]{4})/i,
  ];
  for (let i = 0; i < labelPatterns.length; i += 1) {
    const captured = labelPatterns[i].exec(text)?.[1];
    if (captured) return captured.trim();
  }
  return null;
}
|
|
|
|
/**
 * Parse raw Form 16A text (REform16-aligned) into structured fields.
 *
 * Blank lines are dropped and lines trimmed before matching. The deductor block
 * is the text between the "Name and address of the deductor" heading and the
 * next "Name and address of the deductee" / "PAN of the deductor" heading; the
 * remaining fields come from the dedicated `extract*` helpers.
 *
 * @param text Plain text extracted from the PDF.
 * @returns Extracted fields; anything that was not found is `null`.
 */
function parseForm16ARawText(text: string): Form16AExtractedData {
  const lines = text.split(/\r?\n/).map((l) => l.trim()).filter(Boolean);
  const fullText = lines.join('\n');

  let nameAndAddressOfDeductor: string | null = null;
  const deductorStart = fullText.search(/Name\s+and\s+address\s+of\s+the\s+deductor/i);
  const deductorEnd = fullText.search(/Name\s+and\s+address\s+of\s+the\s+deductee|PAN\s+of\s+the\s+deductor/i);
  if (deductorStart !== -1 && deductorEnd !== -1 && deductorEnd > deductorStart) {
    const block = fullText.slice(deductorStart, deductorEnd);
    const afterLabel = block.replace(/Name\s+and\s+address\s+of\s+the\s+deductor\s*/i, '').trim();
    nameAndAddressOfDeductor = afterLabel.split(/\n/).map((l) => l.trim()).filter(Boolean).join(', ') || null;
  }
  if (!nameAndAddressOfDeductor) nameAndAddressOfDeductor = extractDeductorName(fullText);

  const tanOfDeductor = extractTAN(fullText);
  const totalAmountPaid = extractTotalAmountPaidForm16A(fullText);
  const tdsAmounts = extractTDSAmountsForm16A(fullText);
  // When a dedicated figure is missing, the OLTAS "Total (Rs)" stands in for it;
  // "deposited" finally falls back to "deducted".
  const totalTaxDeducted = tdsAmounts.taxDeducted ?? tdsAmounts.totalRs ?? null;
  const totalTdsDeposited = tdsAmounts.taxDeposited ?? tdsAmounts.totalRs ?? totalTaxDeducted ?? null;
  const form16aNumber = extractForm16ANumber(fullText);
  const assessmentYear = extractAssessmentYear(fullText);
  const quarter = extractQuarterForm16A(fullText);
  const natureOfPayment = extractNatureOfPayment(fullText);
  const transactionDate = extractTransactionDate(fullText);
  const statusOfMatchingOltas = extractOltasStatus(fullText);
  const certificateDate = extractCertificateDate(fullText);
  const dateOfBooking = extractDateOfBooking(fullText);

  let financialYear = extractFinancialYear(fullText);
  if (!financialYear && assessmentYear) {
    const parts = assessmentYear.split(/[-/]/);
    const assessmentStart = parseInt(parts[0], 10);
    if (parts.length === 2 && !Number.isNaN(assessmentStart) && !Number.isNaN(parseInt(parts[1], 10))) {
      // The financial year ends in the calendar year the assessment year starts
      // in. Deriving the suffix from that start year (instead of subtracting one
      // from the AY suffix) stays correct across a century boundary such as
      // AY 1999-00 -> FY 1998-99.
      const endsWithFullYear = parts[1].trim().length >= 4;
      const fyEnd = endsWithFullYear ? String(assessmentStart) : String(assessmentStart).slice(-2);
      financialYear = `${assessmentStart - 1}-${fyEnd}`;
    }
  }

  const parsedDeductor = parseDeductorBlock(nameAndAddressOfDeductor || '');
  return {
    nameAndAddressOfDeductor,
    deductorName: parsedDeductor.name || nameAndAddressOfDeductor,
    deductorAddress: parsedDeductor.address ?? null,
    deductorPhone: parsedDeductor.phone ?? null,
    deductorEmail: parsedDeductor.email ?? null,
    totalAmountPaid: totalAmountPaid ?? null,
    totalTaxDeducted: totalTaxDeducted ?? null,
    totalTdsDeposited: totalTdsDeposited ?? null,
    tanOfDeductor,
    natureOfPayment: natureOfPayment ?? null,
    transactionDate: transactionDate ?? null,
    statusOfMatchingOltas: statusOfMatchingOltas ?? null,
    dateOfBooking: dateOfBooking ?? null,
    assessmentYear: assessmentYear ?? null,
    quarter: quarter ?? null,
    form16aNumber,
    financialYear: financialYear ?? null,
    certificateDate,
    // Aliases of the fields above.
    tanNumber: tanOfDeductor,
    tdsAmount: totalTaxDeducted ?? null,
    totalAmount: totalAmountPaid ?? null,
  };
}
|
|
|
|
/**
 * Fallback: pdf-parse (v2 PDFParse API) + parseForm16ARawText (REform16-aligned).
 *
 * Reads the file, extracts its text layer and runs the regex parser over it.
 * The TAN and quarter are re-validated before the result is returned.
 *
 * @param filePath Path of the PDF to read.
 * @returns `success: true` with `method: 'fallback'` when text was extracted;
 *   otherwise `success: false` with `error` / `message`. Errors are logged and
 *   reported in the result rather than thrown.
 */
async function fallbackExtraction(filePath: string): Promise<Form16OcrResult> {
  try {
    const dataBuffer = fs.readFileSync(filePath);
    const { PDFParse } = await import('pdf-parse');
    const parser = new PDFParse({ data: new Uint8Array(dataBuffer) });

    let text: unknown;
    try {
      const textResult = await parser.getText();
      text = textResult?.text ?? '';
    } finally {
      // Release the parser even when getText() rejects. A failure while
      // releasing must not discard text that was already extracted.
      try {
        await parser.destroy();
      } catch (destroyErr: unknown) {
        logger.warn('[Form16 OCR] Fallback: failed to release PDF parser:', destroyErr);
      }
    }

    if (typeof text !== 'string' || text === '') {
      logger.warn('[Form16 OCR] Fallback: no text extracted from PDF');
      return { success: false, message: 'No text could be extracted from PDF', error: 'Empty PDF text' };
    }

    const extracted = parseForm16ARawText(text);
    if (extracted.tanOfDeductor && !/^[A-Z]{4}[0-9]{5}[A-Z]{1}$/.test(extracted.tanOfDeductor)) {
      extracted.tanOfDeductor = null;
      extracted.tanNumber = null;
    }
    if (extracted.quarter) {
      const q = extracted.quarter.toUpperCase().trim();
      const qMatch = q.match(/[Q]?([1-4])/);
      extracted.quarter = /^Q[1-4]$/.test(q) ? q : (qMatch ? `Q${qMatch[1]}` : null);
    }

    logger.info('[Form16 OCR] Fallback extraction completed');
    return {
      success: true,
      data: extracted,
      method: 'fallback',
      ocrProvider: 'Regex fallback',
    };
  } catch (error: unknown) {
    const errMsg = error instanceof Error ? error.message : String(error);
    logger.error('[Form16 OCR] Fallback extraction error:', error);
    return {
      success: false,
      error: errMsg,
      message: 'Failed to extract data from PDF',
    };
  }
}
|
|
|
|
/**
 * Normalise the JSON object returned by the model into `Form16AExtractedData`.
 *
 * - Strings are trimmed and amounts parsed (`getStr` / `getNum`).
 * - The certificate number is dropped when it looks like filler text.
 * - TDS figures that are negative or whole numbers below 100 are dropped.
 * - The TAN must match 4 letters + 5 digits + 1 letter, else it is dropped.
 * - The quarter is normalised to `Q1`..`Q4`.
 *
 * Each fallback (alias key, or a field parsed out of the deductor block) is
 * applied after the primary value has been sanitised, so an empty or unusable
 * primary value no longer blocks the fallback.
 *
 * @param extracted Parsed model output.
 * @returns Cleaned fields; anything missing or rejected is `null`.
 */
function sanitizeAndCleanGeminiData(extracted: Record<string, unknown>): Form16AExtractedData {
  const invalidCertWords = ['contains', 'certificate', 'number', 'nil', 'na', 'n/a', 'none', 'not', 'mentioned', 'see', 'above', 'below', 'refer', 'document', 'form', 'the', 'a', 'an', 'last'];
  const isInvalidCertNumber = (s: string | null): boolean => {
    if (!s || s.length > 50) return true;
    const lower = s.toLowerCase();
    if (invalidCertWords.some((w) => lower === w || lower.startsWith(w + ' ') || lower.endsWith(' ' + w))) return true;
    if (/\d/.test(s)) return false;
    if (/^[A-Z0-9\-]{3,30}$/i.test(s)) return false;
    return true;
  };
  const rawCertNo = getStr(extracted.form16aNumber);
  const form16aNumber = rawCertNo && !isInvalidCertNumber(rawCertNo) ? rawCertNo : null;

  // Whole numbers below 100 are treated as stray digits (page numbers, row
  // counters) rather than TDS amounts.
  const sanitizeTds = (n: number | null): number | null => {
    if (n == null || n < 0) return null;
    if (n >= 100) return n;
    if (Number.isInteger(n) && n < 100) return null;
    return n;
  };
  const safeTdsDeducted = sanitizeTds(getNum(extracted.totalTaxDeducted) ?? getNum(extracted.tdsAmount));
  const safeTdsDeposited = sanitizeTds(getNum(extracted.totalTdsDeposited) ?? getNum(extracted.tdsAmount));
  const totalAmountPaid = getNum(extracted.totalAmountPaid) ?? getNum(extracted.totalAmount);

  const deductorBlock = getStr(extracted.nameAndAddressOfDeductor) ?? getStr(extracted.deductorName);
  const parsedDeductor = parseDeductorBlock(deductorBlock);

  const tanStr = getStr(extracted.tanOfDeductor) ?? getStr(extracted.tanNumber);
  let tanUpper: string | null = tanStr ? tanStr.toUpperCase().trim() : null;
  if (tanUpper && !/^[A-Z]{4}[0-9]{5}[A-Z]{1}$/.test(tanUpper)) {
    tanUpper = null;
  }

  const quarterRaw = getStr(extracted.quarter);
  let quarter: string | null = null;
  if (quarterRaw) {
    const q = quarterRaw.toUpperCase().trim();
    if (/^Q[1-4]$/.test(q)) quarter = q;
    else {
      const m = q.match(/[Q]?([1-4])/);
      if (m) quarter = `Q${m[1]}`;
    }
  }

  return {
    nameAndAddressOfDeductor: deductorBlock,
    deductorName: getStr(extracted.deductorName) ?? parsedDeductor.name ?? deductorBlock,
    deductorAddress: getStr(extracted.deductorAddress) ?? parsedDeductor.address,
    deductorPhone: getStr(extracted.deductorPhone) ?? parsedDeductor.phone,
    deductorEmail: getStr(extracted.deductorEmail) ?? parsedDeductor.email,
    totalAmountPaid,
    totalTaxDeducted: safeTdsDeducted,
    totalTdsDeposited: safeTdsDeposited,
    tanOfDeductor: tanUpper,
    natureOfPayment: getStr(extracted.natureOfPayment),
    transactionDate: getStr(extracted.transactionDate),
    statusOfMatchingOltas: getStr(extracted.statusOfMatchingOltas),
    dateOfBooking: getStr(extracted.dateOfBooking),
    assessmentYear: getStr(extracted.assessmentYear),
    quarter,
    form16aNumber,
    financialYear: getStr(extracted.financialYear),
    certificateDate: getStr(extracted.certificateDate),
    // Aliases of the fields above.
    tanNumber: tanUpper,
    tdsAmount: safeTdsDeducted,
    totalAmount: totalAmountPaid,
  };
}
|
|
|
|
/**
 * Run Form 16A extraction via Vertex AI (service account).
 *
 * Project, location and model come from `GCP_PROJECT_ID`,
 * `FORM16_VERTEX_LOCATION` / `VERTEX_AI_LOCATION` and `GEMINI_MODEL`, each with
 * a built-in default. The regex fallback is used when no key file is available,
 * when the model returns no text, or when its output is not a JSON object.
 *
 * @param filePath Path of the source file (used for logging and the fallback).
 * @param fileBase64 File content, base64-encoded.
 * @param mimeType MIME type sent with the inline data.
 * @returns The Vertex AI result, or the result of `fallbackExtraction`. Errors
 *   thrown by the Vertex client are not caught here.
 */
async function extractWithVertexAI(filePath: string, fileBase64: string, mimeType: string): Promise<Form16OcrResult> {
  const projectId = process.env.GCP_PROJECT_ID?.trim() || 're-platform-workflow-dealer';
  const location = process.env.FORM16_VERTEX_LOCATION?.trim() || process.env.VERTEX_AI_LOCATION?.trim() || 'us-central1';
  const modelId = process.env.GEMINI_MODEL?.trim() || 'gemini-2.0-flash-lite';

  const keyPath = getForm16VertexKeyPath();
  if (!keyPath || !fs.existsSync(keyPath)) {
    logger.warn('[Form16 OCR] Vertex: no service account key file found. Set GCP_KEY_FILE or GOOGLE_APPLICATION_CREDENTIALS.');
    return await fallbackExtraction(filePath);
  }

  const vertexAI = new VertexAI({
    project: projectId,
    location,
    googleAuthOptions: { keyFilename: keyPath },
  });
  const generativeModel = vertexAI.getGenerativeModel({
    model: modelId,
    generationConfig: { temperature: 0.1, topP: 0.95, topK: 40, maxOutputTokens: 8192 },
  });
  logger.info(`[Form16 OCR] Using Vertex AI (${modelId}, ${location}) for ${path.basename(filePath)}`);

  const request = {
    contents: [
      {
        role: 'user',
        parts: [
          { text: GEMINI_PROMPT },
          { inlineData: { mimeType, data: fileBase64 } },
        ],
      },
    ],
  };
  const response = await generativeModel.generateContent(request);
  const candidate = response.response?.candidates?.[0];
  const textPart = candidate?.content?.parts?.[0];
  const text = textPart && 'text' in textPart ? (textPart as { text: string }).text : '';
  if (!text || !text.trim()) {
    logger.warn('[Form16 OCR] Vertex AI returned no text, using fallback');
    return await fallbackExtraction(filePath);
  }

  let parsed: unknown;
  try {
    // Strip Markdown code fences, then parse the outermost {...} span if any.
    const cleaned = text.trim().replace(/```json\s*/g, '').replace(/```\s*/g, '').trim();
    const jsonMatch = cleaned.match(/\{[\s\S]*\}/);
    parsed = JSON.parse(jsonMatch ? jsonMatch[0] : cleaned);
  } catch (parseErr) {
    logger.warn('[Form16 OCR] Failed to parse Vertex AI JSON, using fallback:', parseErr);
    return await fallbackExtraction(filePath);
  }
  // Valid JSON that is not an object (null, a number, a string, an array)
  // carries no fields; treating it as success would report an all-null result.
  if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
    logger.warn('[Form16 OCR] Vertex AI JSON was not an object, using fallback');
    return await fallbackExtraction(filePath);
  }

  const data = sanitizeAndCleanGeminiData(parsed as Record<string, unknown>);
  logger.info('[Form16 OCR] Vertex AI extraction completed successfully');
  return {
    success: true,
    data,
    method: 'gemini',
    ocrProvider: 'Vertex AI (Gemini)',
  };
}
|
|
|
|
/**
 * Extract Form 16A details from a PDF or image file.
 *
 * Strategy: (1) Gemini API when `GEMINI_API_KEY` is set, (2) Vertex AI with a
 * service account, (3) regex fallback. Errors raised while calling the model
 * are logged and answered with the regex fallback.
 *
 * @param filePath Path of the uploaded certificate. `.pdf`, `.png`, `.jpg`,
 *   `.jpeg` and `.webp` are sent with their own MIME type; any other extension
 *   is sent as `image/png`.
 * @returns The extraction result of the first path that succeeds, or the
 *   (possibly unsuccessful) regex fallback result.
 * @throws If the file cannot be read; the read happens before any error
 *   handling is in place.
 */
export async function extractForm16ADetails(filePath: string): Promise<Form16OcrResult> {
  const fileBuffer = fs.readFileSync(filePath);
  const fileBase64 = fileBuffer.toString('base64');

  // Send the real MIME type: labelling a JPEG/WebP upload as PNG misdescribes
  // the payload to the model.
  const mimeTypeByExtension = new Map<string, string>([
    ['.pdf', 'application/pdf'],
    ['.png', 'image/png'],
    ['.jpg', 'image/jpeg'],
    ['.jpeg', 'image/jpeg'],
    ['.webp', 'image/webp'],
  ]);
  const mimeType = mimeTypeByExtension.get(path.extname(filePath).toLowerCase()) ?? 'image/png';

  // Shared recovery chain for an unusable Gemini API response.
  const tryVertexThenRegex = async (): Promise<Form16OcrResult> => {
    const vertexResult = await extractWithVertexAI(filePath, fileBase64, mimeType);
    if (vertexResult.success) return vertexResult;
    return await fallbackExtraction(filePath);
  };

  try {
    const geminiKey = process.env.GEMINI_API_KEY?.trim();
    if (geminiKey) {
      const genAI = new GoogleGenerativeAI(geminiKey);
      // Trimmed like the other environment values read in this file.
      const modelId = process.env.GEMINI_MODEL?.trim() || 'gemini-2.0-flash';
      const model = genAI.getGenerativeModel({
        model: modelId,
        generationConfig: { temperature: 0.1, topP: 0.95, topK: 40 },
      });
      logger.info(`[Form16 OCR] Using Gemini API (${modelId}) for ${path.basename(filePath)}`);

      const imagePart = { inlineData: { data: fileBase64, mimeType } };
      const result = await model.generateContent([GEMINI_PROMPT, imagePart]);
      const response = result.response;
      if (!response) {
        logger.warn('[Form16 OCR] Gemini API returned no response, trying Vertex AI or fallback');
        return await tryVertexThenRegex();
      }

      let text: string;
      try {
        text = response.text();
      } catch (textErr) {
        logger.warn('[Form16 OCR] Gemini API response.text() failed, trying Vertex AI or fallback:', textErr);
        return await tryVertexThenRegex();
      }
      if (!text || !text.trim()) {
        return await tryVertexThenRegex();
      }

      let parsed: unknown;
      try {
        // Strip Markdown code fences, then parse the outermost {...} span if any.
        const cleaned = text.trim().replace(/```json\s*/g, '').replace(/```\s*/g, '').trim();
        const jsonMatch = cleaned.match(/\{[\s\S]*\}/);
        parsed = JSON.parse(jsonMatch ? jsonMatch[0] : cleaned);
      } catch (parseErr) {
        logger.warn('[Form16 OCR] Failed to parse Gemini API JSON, using fallback:', parseErr);
        return await fallbackExtraction(filePath);
      }
      // Valid JSON that is not an object carries no fields; do not report it as
      // a successful extraction.
      if (parsed === null || typeof parsed !== 'object' || Array.isArray(parsed)) {
        logger.warn('[Form16 OCR] Gemini API JSON was not an object, using fallback');
        return await fallbackExtraction(filePath);
      }

      const data = sanitizeAndCleanGeminiData(parsed as Record<string, unknown>);
      return {
        success: true,
        data,
        method: 'gemini',
        ocrProvider: 'Google Gemini API',
      };
    }

    // No API key: use Vertex AI with service account
    const vertexResult = await extractWithVertexAI(filePath, fileBase64, mimeType);
    if (vertexResult.success) return vertexResult;
    logger.warn('[Form16 OCR] Vertex AI failed or unavailable, using regex fallback');
    return await fallbackExtraction(filePath);
  } catch (error: unknown) {
    logger.error('[Form16 OCR] Gemini/Vertex extraction error:', error);
    logger.info('[Form16 OCR] Falling back to regex-based extraction');
    return await fallbackExtraction(filePath);
  }
}
|