import { calculateMatch, normalizePersonNameExtract } from './utils';

/**
 * Regex-based extraction logic for CPC-CSD documents.
 * Provides a lightweight alternative to Gemini for common patterns.
 * Field names align with MSD payloads from the CPC dashboard (e.g. authority_letter).
 */

/** Optional context that steers rule-based extraction. */
export type RuleExtractHints = {
  /** MSD fields typed in UI — used to find the same text inside the PDF (no "Name:" label needed). */
  msdPayload?: Record<string, unknown>;
  /** When `CSD_PO`, prefer buyer/beneficiary lines (Sold To, Bill To, …) over the first generic `Name:` (often supplier). */
  documentType?: string;
};

/** Escapes every regex metacharacter in `s` so it can be embedded literally in a `RegExp`. */
function escapeRegExp(s: string): string {
  return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * If MSD name appears verbatim (spacing flexible) in PDF text, return the matched span.
 *
 * @param body - Text to search.
 * @param expected - Name as typed in MSD; values shorter than 2 characters are ignored.
 * @returns The matched text with whitespace runs collapsed to single spaces, or `null`.
 */
function matchMsdNameInBody(body: string, expected: string): string | null {
  const e = String(expected || '').trim();
  if (e.length < 2) return null;
  // Any whitespace run in the expected name may match any whitespace run in the body.
  const flex = escapeRegExp(e).replace(/\s+/g, '\\s+');
  const m = body.match(new RegExp(flex, 'i'));
  return m ? m[0].replace(/\s+/g, ' ').trim() : null;
}

/**
 * Same word as MSD on a line with other text (table cells, "Customer Arjun …") — strict
 * substring match often fails.
 *
 * Returns the whole line when it equals the expected value (case-insensitive), otherwise the
 * first token on a line (≤ 160 chars) that equals it after stripping surrounding punctuation.
 *
 * @returns The matching line or token as it appears in `body`, or `null`.
 */
function findMsdNameTokenInOcr(body: string, expected: string): string | null {
  const h = String(expected || '').trim();
  if (h.length < 2 || !body.trim()) return null;
  const hl = h.toLowerCase();
  // Short table-header style tokens that must never be reported as a name.
  const noise = /^(qty|ref|date|page|gst|hsn|po|no|id|by|to|of|in|at|sl|sr|index|desc|amount|total)$/i;
  const lines = body
    .split(/\r?\n/)
    .map((l) => l.trim())
    .filter((l) => l.length > 0);
  for (const line of lines) {
    if (line.length > 160) continue;
    if (line.toLowerCase() === hl) return line;
    const parts = line.split(/[\s,;:|/<>()[\]]+/).filter(Boolean);
    for (const raw of parts) {
      // Trim leading/trailing characters that are neither Latin letters, Devanagari nor digits.
      const p = raw.replace(/^[^A-Za-z\u0900-\u097F0-9]+|[^A-Za-z\u0900-\u097F0-9]+$/g, '');
      if (!p || p.length < 2 || noise.test(p)) continue;
      if (p.toLowerCase() === hl) return p;
    }
  }
  return null;
}

/**
 * Lines that start with one of these words (or an `n / m` page counter) are letter
 * boilerplate and are never considered as a name candidate. The word alternatives are
 * word-bounded so that real names sharing a prefix ("Tomar", "Noor", "Siraj") are kept.
 */
const RE_NON_NAME_LINE_START =
  /^(?:(?:pages?|ref(?:erence)?|no\.?|dated?|subject|to|from|dear|sir|madam|annex(?:ure)?|schedule|authority|letter|royal|enfield)\b|\d+\s*\/\s*\d+)/i;

/**
 * Pick a short line whose fuzzy score vs MSD is high (authority letters often put name on its own line).
 *
 * @param body - Text to scan line by line (lines of 3–119 characters are considered).
 * @param expected - Name as typed in MSD.
 * @param minScore - Minimum `calculateMatch` score a line must reach.
 * @returns The best-scoring line, or `null` when no line reaches `minScore`.
 */
function pickNameLineByMsd(body: string, expected: string, minScore = 52): string | null {
  const exp = String(expected || '').trim();
  if (exp.length < 2 || !body.trim()) return null;
  let best: { line: string; score: number } | null = null;
  const lines = body
    .split(/\r?\n/)
    .map((l) => l.trim())
    .filter((l) => l.length > 2 && l.length < 120);
  for (const line of lines) {
    if (RE_NON_NAME_LINE_START.test(line)) continue;
    const s = calculateMatch(exp, line, 'authorized_person_name');
    if (s >= minScore && (!best || s > best.score)) {
      best = { line, score: s };
    }
  }
  return best?.line ?? null;
}

/** Upper-cases and strips whitespace; returns the value only if it has the 5-letter/4-digit/1-letter PAN shape. */
function normalizePan(s: string): string | null {
  const p = String(s || '')
    .toUpperCase()
    .replace(/\s/g, '');
  return /^[A-Z]{5}[0-9]{4}[A-Z]$/.test(p) ? p : null;
}

/** If MSD PAN appears in PDF text, return canonical PAN (OCR may split with spaces). */
function panFromMsdHint(body: string, msdPan: unknown): string | null {
  const p = normalizePan(String(msdPan ?? ''));
  if (!p || !body) return null;
  // Compare against the body with whitespace and hyphens removed.
  const compact = body.toUpperCase().replace(/[\s-]/g, '');
  return compact.includes(p) ? p : null;
}

/**
 * If MSD amount digits appear in body, return normalized digit string for range match.
 *
 * Only the integer part (at least 2 digits) is searched for, against the body reduced to digits.
 */
function invoiceDigitsFromMsdHint(body: string, msdAmt: unknown): string | null {
  const d = String(msdAmt ?? '').replace(/[^\d.]/g, '');
  if (!d) return null;
  const intPart = d.split('.')[0] ?? '';
  if (intPart.length >= 2 && body.replace(/[^\d]/g, '').includes(intPart)) {
    return d;
  }
  return null;
}

/** Supplier / letterhead lines — not the CSD customer individual name. */
const RE_COMPANY_NAME_HINT =
  /\b(LIMITED|LTD\.?|L\.?\s*L\.?\s*P\.?|PVT\.?\s*LTD|PRIVATE\s+LIMITED|PVT|PTE|INC\.?|CORP|CORPORATION|INDIA\s+LTD|MOTORS?|AUTOMOBILES?|DEALERS?|ENTERPRISES?|SALES\s*(?:&|AND)?\s*SERVICE|WORKS|AGENCIES)\b/i;

/**
 * Heuristic: true when `s` contains a company keyword, or is a long (≥ 14 chars) run of
 * capitals/digits/`.`/`&`/`-` without double spaces.
 */
function looksLikeCompanyLine(s: string): boolean {
  const x = String(s || '').trim();
  if (!x) return false;
  if (RE_COMPANY_NAME_HINT.test(x)) return true;
  if (/^[A-Z0-9.&\s\-]{14,}$/.test(x) && !/\s{2,}/.test(x)) return true;
  return false;
}

/**
 * Cleans the text captured after a buyer label: drops leading separators, cuts at the first
 * following field keyword (GSTIN, PAN, Phone, …) and collapses whitespace.
 */
function trimBuyerCapture(raw: string): string {
  let s = String(raw || '')
    .replace(/\r/g, '')
    .trim();
  s = s.replace(/^[:\-–—\s]+/, '');
  const cut = s.split(/\b(?:GSTIN|PAN|Phone|Tel|Email|E-?mail|Mob|Mobile|Address|Qty|Quantity|Part)\b/i)[0];
  s = (cut ?? s).trim();
  return s.replace(/\s+/g, ' ').trim();
}

/** True when the hinted document type contains `CSD_PO` or `PURCHASE_ORDER` (case-insensitive). */
function isCsdPoHints(hints?: RuleExtractHints): boolean {
  const dt = String(hints?.documentType || '').toUpperCase();
  return dt.includes('CSD_PO') || dt.includes('PURCHASE_ORDER');
}

/**
 * Many CSD PO line-items print: 16-digit card/UIN then customer name then plot no / address
 * (Description column). These tokens are vehicle/product words, not parts of a person name.
 */
const RE_VEHICLE_TOKENS =
  /^(ROYAL|ENFIELD|METEOR|CLASSIC|BULLET|HIMALAYAN|INTERCEPTOR|CONTINENTAL|STELLAR|THUNDER|BS-?VI|BSVI|SUPER|VARIANT|MODEL|CC|HP|ABS|QTY|HSN)$/i;

/**
 * Sanity check for a name candidate taken from a PO description cell: 3–72 characters,
 * 1–6 words, not company-like, no vehicle token, and at least one purely alphabetic word.
 */
function isPlausibleHumanNameFromPoDescription(s: string): boolean {
  const x = String(s || '')
    .replace(/\s+/g, ' ')
    .trim();
  if (x.length < 3 || x.length > 72) return false;
  const parts = x.split(/\s+/).filter(Boolean);
  if (parts.length < 1 || parts.length > 6) return false;
  if (looksLikeCompanyLine(x)) return false;
  for (const p of parts) {
    if (RE_VEHICLE_TOKENS.test(p)) return false;
  }
  return parts.some((p) => /^[A-Za-z\u0900-\u097F]{2,}$/.test(p));
}

/**
 * Pattern: `5312423002619089 KALAIYARASAN K 71` — 16 digits (optional spaces in groups of 4),
 * then name tokens, then often a short plot/house number or newline/address.
 *
 * @returns The first plausible name found directly after a 16-digit number, or `null`.
 */
function extractCsdPoNameInDescriptionColumn(body: string): string | null {
  const norm = body.replace(/\r\n/g, '\n').replace(/\u00a0/g, ' ');
  const digitRes: RegExp[] = [/\b\d{4}\s+\d{4}\s+\d{4}\s+\d{4}\b/g, /\b\d{16}\b/g];
  // Start offsets already examined, so the same number is not processed twice.
  const seenAt = new Set<number>();
  for (const re of digitRes) {
    re.lastIndex = 0;
    let dm: RegExpExecArray | null;
    while ((dm = re.exec(norm)) !== null) {
      const compact = dm[0].replace(/\s/g, '');
      if (compact.length !== 16 || !/^\d{16}$/.test(compact)) continue;
      if (seenAt.has(dm.index)) continue;
      seenAt.add(dm.index);
      const tail = norm.slice(dm.index + dm[0].length).replace(/^\s+/, '');
      // Up to six alphabetic words, terminated by a short number, a newline or end of text.
      let nm = tail.match(
        /^([A-Za-z\u0900-\u097F]+(?:\s+[A-Za-z\u0900-\u097F]+){0,5})(?=\s+\d{1,4}\b|\s*\n|\s*$)/i
      );
      if (!nm?.[1]) {
        // Fallback: accept just the first word when it looks like a person name.
        const loose = tail.match(/^([A-Za-z\u0900-\u097F]{2,25})\b/i);
        if (loose?.[1] && isPlausibleHumanNameFromPoDescription(loose[1])) nm = loose;
      }
      if (!nm?.[1]) continue;
      const candidate = nm[1].replace(/\s+/g, ' ').trim();
      if (isPlausibleHumanNameFromPoDescription(candidate)) {
        return candidate;
      }
    }
  }
  return null;
}

/**
 * CSD / defence-style POs usually put the customer under Sold To / Bill To / card holder,
 * not under the first "Name:" (often dealer contact).
 *
 * Patterns are tried in priority order; the first one yielding a 2–100 character value that
 * does not start with a boilerplate word wins.
 */
function extractCsdPoBuyerFromLabels(body: string): string | null {
  const norm = body.replace(/\r\n/g, '\n');
  const patterns: RegExp[] = [
    /(?:^|\n)\s*Sold\s*To\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*Bill\s*To\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*Ship\s*To\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*Consignee\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*(?:Buyer|Purchaser)\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*Customer\s*(?:Name|Details)?\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*CSD\s*Card(?:\s*Holder)?\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*Card\s*Holder(?:\s*Name)?\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*Beneficiary\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*Name\s*of\s*(?:the\s*)?(?:Purchaser|Buyer|Customer)\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*(?:Ordered|Order)\s*(?:By|Placed\s*By)\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i
  ];
  for (const re of patterns) {
    const m = norm.match(re);
    if (!m?.[1]) continue;
    const line = trimBuyerCapture(m[1]);
    if (line.length < 2 || line.length > 100) continue;
    if (/^(page|date|amount|total|ref|subject)\b/i.test(line)) continue;
    return line;
  }
  return null;
}

/**
 * 12-digit Aadhaar, optionally spaced in groups of four. The look-arounds reject a match
 * that is directly adjacent (same line) to another 4-digit group, so a spaced 16-digit
 * card number such as `5312 4230 0261 9089` is not misread as an Aadhaar number.
 */
const RE_AADHAAR = /(?<!\b\d{4}[ \t])\b\d{4}\s?\d{4}\s?\d{4}\b(?![ \t]\d{4}\b)/;

/**
 * Currency amount. `Rs`/`INR` must not be the tail of a longer word (e.g. "MOTORS 350"),
 * and the numeric run must contain at least one digit.
 */
const RE_INVOICE_AMOUNT = /(?:₹|(?<![A-Za-z])(?:Rs\.?|INR))\s?(?=[\d,]*\d)[\d,]+(?:\.\d{1,2})?/i;

/**
 * Order / authorisation number. The keyword must not be followed by another letter, so
 * "AUTHORIZATION", "Authority" or "POLICY" do not yield their own suffix as the number.
 */
const RE_ORDER_NUMBER = /\b(?:PO|ORDER|AUTH(?:ORI[SZ]ATION)?)(?![A-Za-z])\s*[:\-]?\s*([A-Z0-9\-\/]{4,})/i;

/** Rule-based (regex + heuristics) field extraction from OCR text. */
export class CpcRuleExtractService {
  /**
   * If Vertex returned a supplier-style string but OCR shows a clear buyer line, prefer the buyer line.
   *
   * @param ocrText - OCR text of the document.
   * @param customerName - Current customer name candidate (any value; stringified and trimmed).
   * @returns The buyer found in the OCR text when the current value is empty or company-like,
   *   otherwise the current value; `null` when neither is usable.
   */
  static refineCsdPoCustomerName(ocrText: string, customerName: unknown): string | null {
    const cur = String(customerName ?? '').trim();
    const text = String(ocrText || '');
    const fromDesc = extractCsdPoNameInDescriptionColumn(text);
    const fromLabels = extractCsdPoBuyerFromLabels(text);
    const buyer = fromDesc || fromLabels;
    if (!buyer) return cur.length >= 2 ? cur : null;
    if (!cur) return buyer;
    if (looksLikeCompanyLine(cur) && !looksLikeCompanyLine(buyer)) return buyer;
    return cur;
  }

  /**
   * Extracts name, PAN, order number, amount, Aadhaar and stamp/signature flags from OCR text.
   *
   * Name resolution order: CSD PO specific extraction (when hinted) → labelled fields →
   * MSD-guided search → title-case / ALL CAPS line fallback.
   *
   * @param ocrText - OCR text of the document.
   * @param hints - Optional MSD payload and document type.
   * @returns `extracted_fields` (values or `null`) and a parallel `field_confidence` map.
   */
  static extractWithRules(ocrText: string, hints?: RuleExtractHints) {
    const t = String(ocrText || '');
    const msd: Record<string, unknown> = hints?.msdPayload ?? {};
    const isCsdPo = isCsdPoHints(hints);

    // Matches 12 digit Aadhaar (with optional spaces)
    const aadhaarMatch = t.match(RE_AADHAAR);

    // Matches currency patterns
    const invoiceMatch = t.match(RE_INVOICE_AMOUNT);

    // Matches common order/auth patterns
    const orderMatch = t.match(RE_ORDER_NUMBER);

    // Matches "Name: [Value]" / "Authorised Person" / applicant-style labels
    const nameMatch = t.match(/\bName\s*[:\-]\s*([A-Za-z][A-Za-z0-9\s.'-]{2,79})/i);
    const authPersonMatch = t.match(
      /\b(?:authorized|authorised)\s+person\s*[:\-]\s*([A-Za-z][A-Za-z0-9\s.'-]{2,79})/i
    );
    const applicantMatch = t.match(
      /\b(?:applicant|holder|customer|borrower|dealer)\s*[:\-]\s*([A-Za-z][A-Za-z0-9\s.'-]{2,79})/i
    );

    let displayNameRaw = isCsdPo
      ? extractCsdPoNameInDescriptionColumn(t) || extractCsdPoBuyerFromLabels(t) || ''
      : '';
    if (!displayNameRaw) {
      displayNameRaw = (authPersonMatch?.[1] || nameMatch?.[1] || applicantMatch?.[1] || '').trim();
    }

    // MSD-guided: name often appears in body exactly as user typed (no label) — same idea as
    // manual compare in CPC-CSD UI flow
    if (!displayNameRaw) {
      const fromAuth = msd.customer_name ?? msd.authorized_person_name ?? msd.name;
      const hint = String(fromAuth ?? '').trim();
      if (hint) {
        // Short hints get a lower fuzzy threshold.
        const minFuzzy = hint.length <= 10 ? 40 : 52;
        displayNameRaw =
          matchMsdNameInBody(t, hint) ||
          findMsdNameTokenInOcr(t, hint) ||
          pickNameLineByMsd(t, hint, minFuzzy) ||
          '';
      }
    }

    // Title / ALL CAPS line fallback — include short single names (e.g. "Arjun") skipped by older rules
    if (!displayNameRaw) {
      const lines = t
        .split(/\r?\n/)
        .map((l) => l.trim())
        .filter(Boolean);
      const noiseLine = /^(qty|ref|date|page|gst|hsn|po|no|id|total|amount|index|desc|sl)$/i;
      for (const line of lines) {
        if (line.length < 3 || line.length > 80) continue;
        if (noiseLine.test(line)) continue;
        if (/^(ref|date|subject|to|from|dear|page|annex|authority|letter|royal|enfield|cpc|csd)\b/i.test(line)) {
          continue;
        }
        if (isCsdPo && looksLikeCompanyLine(line)) {
          continue;
        }
        const words = line.split(/\s+/).filter(Boolean);
        const first = words[0] ?? '';
        const singleName =
          words.length === 1 &&
          /^[A-Za-z\u0900-\u097F]{2,25}$/.test(first) &&
          !RE_VEHICLE_TOKENS.test(first) &&
          !looksLikeCompanyLine(first);
        const multiAllCaps = /^[A-Z][A-Z0-9\s.'-]{4,70}$/.test(line) && words.length >= 2;
        if (singleName || multiAllCaps) {
          displayNameRaw = line;
          break;
        }
        const titleCaseName =
          words.length >= 1 &&
          words.length <= 4 &&
          words.every((w) => /^[A-Za-z\u0900-\u097F]{2,}$/.test(w)) &&
          !words.some((w) => RE_VEHICLE_TOKENS.test(w)) &&
          line.charAt(0) === line.charAt(0).toUpperCase() &&
          /[a-z\u0900-\u097F]/.test(line) &&
          !looksLikeCompanyLine(line);
        if (titleCaseName && line.length <= 48) {
          displayNameRaw = line;
          break;
        }
      }
    }

    let displayName: string | null =
      displayNameRaw.length >= 2 ? displayNameRaw.replace(/\s+/g, ' ').trim() : null;
    if (isCsdPo && displayName) {
      displayName = CpcRuleExtractService.refineCsdPoCustomerName(t, displayName) ?? displayName;
    }
    if (displayName) {
      const n = normalizePersonNameExtract(displayName);
      if (n) displayName = n;
    }

    // PAN (Indian format) + MSD hint (PDF may lack strict word boundaries)
    const panFromRegex = t.match(/\b([A-Z]{5}[0-9]{4}[A-Z])\b/i);
    let panVal = panFromRegex ? String(panFromRegex[1]).toUpperCase() : null;
    if (!panVal && msd.pan_number != null) {
      panVal = panFromMsdHint(t, msd.pan_number);
    }

    // Numeric amount for range matching against MSD invoice_value.
    // Stripping leading/trailing dots removes the "." of "Rs." that survives the digit filter.
    const amountDigits = invoiceMatch
      ? String(invoiceMatch[0])
          .replace(/[^\d.]/g, '')
          .replace(/^\.+|\.+$/g, '')
      : null;
    let invoiceValueNormalized = amountDigits && amountDigits.length ? amountDigits : null;
    if (!invoiceValueNormalized) {
      invoiceValueNormalized =
        invoiceDigitsFromMsdHint(t, msd.po_amount) ||
        invoiceDigitsFromMsdHint(t, msd.letter_amount) ||
        invoiceDigitsFromMsdHint(t, msd.invoice_value);
    }

    const stampPresent = /(stamp|seal|authorized signatory|signature)/i.test(t);
    const govtStampPresent =
      /(govt\.?\s*stamp|government\s*seal|govt\.?\s*signatory|official\s*stamp|authorized\s*signatory)/i.test(t) ||
      stampPresent;
    const stampYesNo = govtStampPresent ? 'yes' : 'no';

    const poOrOrder = orderMatch ? orderMatch[1].trim() : null;
    const aadhaarDigits = aadhaarMatch ? aadhaarMatch[0].replace(/\s/g, '').trim() : null;

    return {
      extracted_fields: {
        authorized_person_name: displayName,
        customer_name: displayName,
        pan_number: panVal,
        order_or_authorisation_number: poOrOrder,
        po_number: poOrOrder,
        order_or_auth_number: poOrOrder,
        invoice_value: invoiceValueNormalized,
        po_amount: invoiceValueNormalized,
        letter_amount: invoiceValueNormalized,
        aadhaar_number: aadhaarDigits,
        aadhar_number: aadhaarDigits,
        stamp_or_signatory_present: stampPresent,
        stamp_sign_present: stampPresent,
        govt_signatory_and_stamp_present: stampYesNo,
        signature_and_stamp: stampYesNo
      },
      field_confidence: {
        authorized_person_name: displayName ? 0.65 : 0.2,
        customer_name: displayName ? 0.65 : 0.2,
        pan_number: panVal ? 0.85 : 0.2,
        order_or_authorisation_number: orderMatch ? 0.7 : 0.2,
        po_number: orderMatch ? 0.7 : 0.2,
        order_or_auth_number: orderMatch ? 0.7 : 0.2,
        invoice_value: invoiceValueNormalized ? 0.7 : 0.2,
        po_amount: invoiceValueNormalized ? 0.7 : 0.2,
        letter_amount: invoiceValueNormalized ? 0.7 : 0.2,
        aadhaar_number: aadhaarMatch ? 0.85 : 0.2,
        aadhar_number: aadhaarMatch ? 0.85 : 0.2,
        stamp_or_signatory_present: stampPresent ? 0.55 : 0.3,
        stamp_sign_present: stampPresent ? 0.55 : 0.3,
        govt_signatory_and_stamp_present: govtStampPresent ? 0.55 : 0.3,
        signature_and_stamp: govtStampPresent ? 0.55 : 0.3
      }
    };
  }
}