/*
 * Re_Backend/src/services/cpc-cdc/CpcRuleExtractService.ts
 * 2026-04-20 20:11:11 +05:30
 *
 * 376 lines · 17 KiB · TypeScript
 */

import { calculateMatch, normalizePersonNameExtract } from './utils';
/**
 * Optional context passed to `CpcRuleExtractService.extractWithRules` to steer the
 * regex heuristics. Both properties are optional; omitting them leaves only the
 * label/shape based rules active.
 */
export type RuleExtractHints = {
/**
 * MSD fields typed in the UI — used to find the same text inside the PDF (no "Name:" label needed).
 * Keys read in this file: `customer_name`, `authorized_person_name`, `name`, `pan_number`,
 * `po_amount`, `letter_amount`, `invoice_value`. Values are untyped and are stringified before use.
 */
msdPayload?: Record<string, unknown>;
/**
 * Document kind. When it contains `CSD_PO` or `PURCHASE_ORDER` (compared case-insensitively),
 * buyer/beneficiary lines (Sold To, Bill To, …) are preferred over the first generic `Name:`
 * (often the supplier).
 */
documentType?: string;
};
/*
 * Module overview
 * ---------------
 * Regex-based extraction logic for CPC-CSD documents.
 * Provides a lightweight alternative to Gemini for common patterns.
 * Field names align with MSD payloads from the CPC dashboard (e.g. authority_letter).
 */

/**
 * Backslash-escapes every regular-expression metacharacter in `s` so the result
 * can be embedded in a `RegExp` source and match `s` literally.
 *
 * @param s - Arbitrary text.
 * @returns `s` with each of `. * + ? ^ $ { } ( ) | [ ] \` prefixed by a backslash.
 */
function escapeRegExp(s: string): string {
  const metaChars = /[.*+?^${}()|[\]\\]/g;
  return s.replace(metaChars, (ch) => `\\${ch}`);
}
/**
 * Searches the document text for the MSD-supplied name, ignoring case and
 * accepting any run of whitespace wherever the expected name contains whitespace.
 *
 * @param body - Text extracted from the PDF / OCR.
 * @param expected - Name as entered in the MSD payload.
 * @returns The matched span with whitespace collapsed to single spaces, or `null`
 *   when the trimmed expected name is shorter than 2 characters or is not found.
 */
function matchMsdNameInBody(body: string, expected: string): string | null {
  const wanted = String(expected || '').trim();
  if (wanted.length < 2) {
    return null;
  }
  // Escape first, then relax whitespace so "A   B" in the PDF still matches "A B".
  const flexibleSource = escapeRegExp(wanted).replace(/\s+/g, '\\s+');
  const found = body.match(new RegExp(flexibleSource, 'i'));
  if (!found) {
    return null;
  }
  return found[0].replace(/\s+/g, ' ').trim();
}
/**
 * Finds the MSD name as a whole line or as a single token on a line that also
 * carries other text (table cells, "Customer Arjun …"), where a strict substring
 * match often fails.
 *
 * Lines longer than 160 characters are ignored. Tokens are split on whitespace and
 * common punctuation, stripped of leading/trailing non-alphanumeric characters,
 * and skipped when shorter than 2 characters or when they are a known noise word.
 *
 * @param body - OCR text, possibly multi-line.
 * @param expected - Name from the MSD payload.
 * @returns The matching line or token exactly as it appears in the text, or `null`.
 */
function findMsdNameTokenInOcr(body: string, expected: string): string | null {
  const needle = String(expected || '').trim();
  if (needle.length < 2 || !body.trim()) {
    return null;
  }
  const needleLower = needle.toLowerCase();
  const noise = /^(qty|ref|date|page|gst|hsn|po|no|id|by|to|of|in|at|sl|sr|index|desc|amount|total)$/i;
  const edgeJunk = /^[^A-Za-z\u0900-\u097F0-9]+|[^A-Za-z\u0900-\u097F0-9]+$/g;

  for (const rawLine of body.split(/\r?\n/)) {
    const line = rawLine.trim();
    if (line.length === 0 || line.length > 160) {
      continue;
    }
    // A line that is the name and nothing else wins before any token inspection.
    if (line.toLowerCase() === needleLower) {
      return line;
    }
    for (const piece of line.split(/[\s,;:|/<>()[\]]+/)) {
      if (!piece) {
        continue;
      }
      const token = piece.replace(edgeJunk, '');
      if (token.length < 2 || noise.test(token)) {
        continue;
      }
      if (token.toLowerCase() === needleLower) {
        return token;
      }
    }
  }
  return null;
}
/**
 * Picks the short line whose fuzzy score against the MSD name is highest
 * (authority letters often put the name on its own line).
 *
 * Only lines of 3–119 characters are considered. Lines that start with a header
 * word (Page, Ref, No, Date, Subject, To, From, Dear, …) or with an `n / m` page
 * marker are skipped. The header words are matched as whole words, so a name such
 * as "Tony Stark", "Noor Ahmed" or "Sirajuddin" is no longer discarded merely
 * because it begins with "to", "no" or "sir".
 *
 * @param body - OCR text, possibly multi-line.
 * @param expected - Name from the MSD payload.
 * @param minScore - Minimum score from `calculateMatch` for a line to qualify.
 *   NOTE(review): the score scale is defined in `./utils`; this code only assumes
 *   that a larger value means a better match.
 * @returns The best-scoring line, or `null` when nothing reaches `minScore`.
 */
function pickNameLineByMsd(body: string, expected: string, minScore = 52): string | null {
  const exp = String(expected || '').trim();
  if (exp.length < 2 || !body.trim()) return null;

  // Whole-word header prefixes (with their common longer forms), or a "1 / 3" page marker.
  const headerLike =
    /^(?:(?:page|ref(?:erence)?|no|dated?|subject|to|from|dear|sir|madam|annex(?:ure)?|schedule|authority|letter|royal|enfield)\b|\d+\s*\/\s*\d+)/i;

  let best: { line: string; score: number } | null = null;
  const lines = body
    .split(/\r?\n/)
    .map((l) => l.trim())
    .filter((l) => l.length > 2 && l.length < 120);

  for (const line of lines) {
    if (headerLike.test(line)) {
      continue;
    }
    const s = calculateMatch(exp, line, 'authorized_person_name');
    // Strictly greater keeps the earliest line when scores tie.
    if (s >= minScore && (!best || s > best.score)) {
      best = { line, score: s };
    }
  }
  return best?.line ?? null;
}
/**
 * Canonicalises a PAN candidate: upper-cases it and removes all whitespace.
 *
 * @param s - Raw PAN text (may contain spaces or lower-case letters).
 * @returns The 10-character PAN (`AAAAA9999A` shape) or `null` if the cleaned
 *   value does not have that shape.
 */
function normalizePan(s: string): string | null {
  const candidate = String(s || '').toUpperCase().replace(/\s/g, '');
  const hasPanShape = /^[A-Z]{5}[0-9]{4}[A-Z]$/.test(candidate);
  if (!hasPanShape) {
    return null;
  }
  return candidate;
}
/**
 * Confirms that the PAN entered in MSD occurs somewhere in the document text.
 * The text is upper-cased and stripped of whitespace and hyphens before the
 * search, so a PAN that OCR split with spaces or dashes is still found.
 *
 * @param body - Text extracted from the PDF / OCR.
 * @param msdPan - PAN value from the MSD payload (any type; stringified).
 * @returns The canonical PAN when it is valid and present in the text, else `null`.
 */
function panFromMsdHint(body: string, msdPan: unknown): string | null {
  const canonical = normalizePan(String(msdPan ?? ''));
  if (!canonical || !body) {
    return null;
  }
  const haystack = body.toUpperCase().replace(/[\s-]/g, '');
  if (haystack.includes(canonical)) {
    return canonical;
  }
  return null;
}
/**
 * Confirms that the MSD amount is printed in the document and, if so, returns the
 * MSD amount as a digit string for range matching.
 *
 * The integer part of the MSD amount must appear as ONE number in the text:
 * its digits may be separated by a single comma, space, tab or no-break space
 * (covers "1,50,000", "150,000" and OCR-split "1 50 000"), and the number must
 * not be a fragment of a longer one (no adjacent digit, no adjacent
 * comma-grouped digit). Digits that belong to unrelated numbers elsewhere in the
 * document (dates, reference numbers, pin codes) therefore no longer combine into
 * a false confirmation.
 *
 * @param body - Text extracted from the PDF / OCR.
 * @param msdAmt - Amount from the MSD payload (any type; stringified).
 * @returns The MSD amount reduced to digits and dots (e.g. `"150000.50"`), or
 *   `null` when the integer part has fewer than 2 digits or is not found.
 */
function invoiceDigitsFromMsdHint(body: string, msdAmt: unknown): string | null {
  const digits = String(msdAmt ?? '').replace(/[^\d.]/g, '');
  const intPart = digits.split('.')[0] ?? '';
  // A single digit would match almost any document, so require at least two.
  if (!body || intPart.length < 2) return null;

  // "150000" -> 1[sep]?5[sep]?0[sep]?0[sep]?0[sep]?0, where sep is a thousands separator.
  const separated = intPart.split('').join('[, \\t\\u00a0]?');
  // Built via the constructor so the pattern is assembled from the digits at runtime.
  const wholeNumber = new RegExp('(?<!\\d,?)' + separated + '(?!,?\\d)');
  return wholeNumber.test(body) ? digits : null;
}
/** Keywords typical of supplier / letterhead lines — not the CSD customer's individual name. */
const RE_COMPANY_NAME_HINT =
/\b(LIMITED|LTD\.?|L\.?\s*L\.?\s*P\.?|PVT\.?\s*LTD|PRIVATE\s+LIMITED|PVT|PTE|INC\.?|CORP|CORPORATION|INDIA\s+LTD|MOTORS?|AUTOMOBILES?|DEALERS?|ENTERPRISES?|SALES\s*(?:&|AND)?\s*SERVICE|WORKS|AGENCIES)\b/i;

/**
 * Heuristic: does this line look like a company / letterhead line?
 *
 * A line qualifies when it contains a company keyword, or when it is at least
 * 14 characters of upper-case letters, digits, `.`, `&`, `-` and single
 * whitespace only. Note that the second rule also fires for a person's name
 * printed in capitals once it reaches 14 characters.
 *
 * @param s - Candidate line.
 * @returns `true` when either rule matches; `false` for empty input.
 */
function looksLikeCompanyLine(s: string): boolean {
  const text = String(s || '').trim();
  if (text.length === 0) {
    return false;
  }
  const hasCompanyKeyword = RE_COMPANY_NAME_HINT.test(text);
  const isLongAllCaps = /^[A-Z0-9.&\s\-]{14,}$/.test(text) && !/\s{2,}/.test(text);
  return hasCompanyKeyword || isLongAllCaps;
}
/**
 * Cleans the text captured after a buyer label ("Sold To", "Bill To", …).
 *
 * Steps: drop carriage returns, strip leading colons/dashes/whitespace, cut the
 * text at the first following field keyword (GSTIN, PAN, Phone, Email, Address,
 * Qty, …) and collapse whitespace runs to single spaces.
 *
 * @param raw - Raw capture from a label regex.
 * @returns The cleaned buyer text; may be empty.
 */
function trimBuyerCapture(raw: string): string {
  const withoutLeadingPunct = String(raw || '')
    .replace(/\r/g, '')
    .trim()
    .replace(/^[:\-–—\s]+/, '');
  // Anything from the next field label onwards belongs to a different column.
  const [beforeNextField] = withoutLeadingPunct.split(
    /\b(?:GSTIN|PAN|Phone|Tel|Email|E-?mail|Mob|Mobile|Address|Qty|Quantity|Part)\b/i
  );
  const kept = (beforeNextField ?? withoutLeadingPunct).trim();
  return kept.replace(/\s+/g, ' ').trim();
}
/**
 * Tells whether the hints describe a CSD purchase order.
 *
 * @param hints - Optional extraction hints.
 * @returns `true` when `documentType`, upper-cased, contains `CSD_PO` or
 *   `PURCHASE_ORDER`; `false` otherwise (including when hints are absent).
 */
function isCsdPoHints(hints?: RuleExtractHints): boolean {
  const docType = String(hints?.documentType || '').toUpperCase();
  return ['CSD_PO', 'PURCHASE_ORDER'].some((marker) => docType.includes(marker));
}
/**
 * Many CSD PO line-items print: 16-digit card/UIN then customer name then plot no / address
 * (Description column). These single-word tokens belong to the vehicle description, not to a person.
 */
const RE_VEHICLE_TOKENS =
/^(ROYAL|ENFIELD|METEOR|CLASSIC|BULLET|HIMALAYAN|INTERCEPTOR|CONTINENTAL|STELLAR|THUNDER|BS-?VI|BSVI|SUPER|VARIANT|MODEL|CC|HP|ABS|QTY|HSN)$/i;

/**
 * Heuristic: could this text, taken from a PO description column, be a person's name?
 *
 * Accepts 3–72 characters and 1–6 words, none of which is a vehicle token, with at
 * least one purely alphabetic word of 2+ letters, and no company keyword.
 *
 * Only the company KEYWORD check is applied here. The broader
 * `looksLikeCompanyLine` also treats any upper-case line of 14+ characters as a
 * company, which rejected ordinary names printed in capitals — including
 * "KALAIYARASAN K", the very shape this column contains.
 *
 * @param s - Candidate name.
 * @returns `true` when the candidate passes every check.
 */
function isPlausibleHumanNameFromPoDescription(s: string): boolean {
  const x = String(s || '')
    .replace(/\s+/g, ' ')
    .trim();
  if (x.length < 3 || x.length > 72) return false;
  const parts = x.split(/\s+/).filter(Boolean);
  if (parts.length < 1 || parts.length > 6) return false;
  // Keyword check only: names in this column are normally printed in capitals.
  if (RE_COMPANY_NAME_HINT.test(x)) return false;
  if (parts.some((p) => RE_VEHICLE_TOKENS.test(p))) return false;
  return parts.some((p) => /^[A-Za-z\u0900-\u097F]{2,}$/.test(p));
}
/**
 * Extracts the customer name that follows a 16-digit card/UIN number in a CSD PO
 * description column.
 *
 * Pattern: `5312423002619089 KALAIYARASAN K 71` — 16 digits (contiguous, or in
 * four whitespace-separated groups of 4), then name tokens, then often a short
 * plot/house number or a newline/address.
 *
 * All spaced-group numbers are examined before contiguous ones; within each
 * kind, matches are examined in document order. The first candidate that passes
 * `isPlausibleHumanNameFromPoDescription` is returned.
 *
 * @param body - OCR text of the document.
 * @returns The name with whitespace collapsed, or `null` if none is found.
 */
function extractCsdPoNameInDescriptionColumn(body: string): string | null {
  const text = body.replace(/\r\n/g, '\n').replace(/\u00a0/g, ' ');
  const visitedOffsets = new Set<number>();

  // Reads the name that starts `rest`: up to six alphabetic words that are
  // followed by a short number, a newline or end of text; otherwise just the
  // first word, provided that word alone is plausible.
  const readNameAt = (rest: string): string | undefined => {
    const strict = rest.match(
      /^([A-Za-z\u0900-\u097F]+(?:\s+[A-Za-z\u0900-\u097F]+){0,5})(?=\s+\d{1,4}\b|\s*\n|\s*$)/i
    );
    if (strict?.[1]) {
      return strict[1];
    }
    const firstWord = rest.match(/^([A-Za-z\u0900-\u097F]{2,25})\b/i)?.[1];
    return firstWord && isPlausibleHumanNameFromPoDescription(firstWord) ? firstWord : undefined;
  };

  const cardNumberPatterns: RegExp[] = [/\b\d{4}\s+\d{4}\s+\d{4}\s+\d{4}\b/g, /\b\d{16}\b/g];
  for (const cardRe of cardNumberPatterns) {
    for (let hit = cardRe.exec(text); hit !== null; hit = cardRe.exec(text)) {
      const digitsOnly = hit[0].replace(/\s/g, '');
      if (!/^\d{16}$/.test(digitsOnly)) {
        continue;
      }
      if (visitedOffsets.has(hit.index)) {
        continue;
      }
      visitedOffsets.add(hit.index);

      const rest = text.slice(hit.index + hit[0].length).replace(/^\s+/, '');
      const rawName = readNameAt(rest);
      if (!rawName) {
        continue;
      }
      const candidate = rawName.replace(/\s+/g, ' ').trim();
      if (isPlausibleHumanNameFromPoDescription(candidate)) {
        return candidate;
      }
    }
  }
  return null;
}
/**
 * Reads the buyer from a labelled line of a CSD / defence-style PO.
 *
 * Such POs usually put the customer under Sold To / Bill To / card holder, not
 * under the first "Name:" (often the dealer contact). Labels are tried in the
 * priority order listed below — not in document order — and the value may sit
 * on the same line as the label or on the next one.
 *
 * @param body - OCR text of the document.
 * @returns The cleaned buyer text (2–100 characters, not starting with a header
 *   word such as Page/Date/Amount/Total/Ref/Subject), or `null`.
 */
function extractCsdPoBuyerFromLabels(body: string): string | null {
  const text = body.replace(/\r\n/g, '\n');
  const labelPatterns: RegExp[] = [
    /(?:^|\n)\s*Sold\s*To\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*Bill\s*To\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*Ship\s*To\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*Consignee\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*(?:Buyer|Purchaser)\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*Customer\s*(?:Name|Details)?\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*CSD\s*Card(?:\s*Holder)?\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*Card\s*Holder(?:\s*Name)?\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*Beneficiary\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*Name\s*of\s*(?:the\s*)?(?:Purchaser|Buyer|Customer)\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i,
    /(?:^|\n)\s*(?:Ordered|Order)\s*(?:By|Placed\s*By)\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})/i
  ];
  const headerWord = /^(page|date|amount|total|ref|subject)\b/i;

  for (const pattern of labelPatterns) {
    const captured = pattern.exec(text)?.[1];
    if (!captured) {
      continue;
    }
    const buyer = trimBuyerCapture(captured);
    const lengthOk = buyer.length >= 2 && buyer.length <= 100;
    if (lengthOk && !headerWord.test(buyer)) {
      return buyer;
    }
  }
  return null;
}
/**
 * Regex-based field extraction for CPC-CSD documents (names, PAN, Aadhaar,
 * order number, amount, stamp/signature presence). All members are static.
 */
export class CpcRuleExtractService {
  /**
   * If Vertex returned a supplier-style string but OCR shows a clear buyer line, prefer the buyer line.
   *
   * @param ocrText - OCR text of the purchase order.
   * @param customerName - Name proposed by an earlier extraction step (any type; stringified).
   * @returns The buyer found in the OCR text when the proposed name is empty, or
   *   when it looks like a company and the buyer does not; otherwise the proposed
   *   name. `null` when no buyer is found and the proposed name is shorter than
   *   2 characters.
   */
  static refineCsdPoCustomerName(ocrText: string, customerName: unknown): string | null {
    const cur = String(customerName ?? '').trim();
    const text = String(ocrText || '');
    // The description column (name right after the card number) outranks label lines.
    const buyer = extractCsdPoNameInDescriptionColumn(text) || extractCsdPoBuyerFromLabels(text);
    if (!buyer) return cur.length >= 2 ? cur : null;
    if (!cur) return buyer;
    if (looksLikeCompanyLine(cur) && !looksLikeCompanyLine(buyer)) return buyer;
    return cur;
  }

  /**
   * Finds the first 12-digit Aadhaar-shaped number (three groups of 4, optional
   * single whitespace between groups) that is NOT part of a longer run of 4-digit
   * groups. Without that check, the first 12 digits of a spaced 16-digit card
   * number ("5312 4230 0261 9089") were reported as an Aadhaar number.
   *
   * @param text - OCR text.
   * @returns The 12 digits without whitespace, or `null`.
   */
  private static findAadhaarDigits(text: string): string | null {
    const candidateRe = /\b\d{4}\s?\d{4}\s?\d{4}\b/g;
    let m: RegExpExecArray | null;
    while ((m = candidateRe.exec(text)) !== null) {
      const start = m.index;
      const end = start + m[0].length;
      // Six characters = optional separator + 4 digits + one character of context for \b.
      const before = text.slice(Math.max(0, start - 6), start);
      const after = text.slice(end, end + 6);
      const continuesLeft = /\b\d{4}[ \t]?$/.test(before);
      const continuesRight = /^[ \t]?\d{4}\b/.test(after);
      if (continuesLeft || continuesRight) continue;
      return m[0].replace(/\s/g, '');
    }
    return null;
  }

  /**
   * Extracts document fields from OCR text using regular expressions and, where
   * labels are missing, the values the user typed into MSD.
   *
   * Name resolution order: (CSD PO only) description column, then buyer labels;
   * labelled "Authorised Person" / "Name" / applicant-style lines; MSD-guided
   * search; finally the first name-shaped line. Several output keys are aliases
   * carrying the same value so different MSD payload shapes can be compared.
   *
   * @param ocrText - OCR / PDF text of the document.
   * @param hints - Optional MSD payload and document type.
   * @returns `extracted_fields` (value or `null` per field; stamp flags are
   *   boolean or `'yes'`/`'no'`) and `field_confidence` (fixed heuristic scores).
   */
  static extractWithRules(ocrText: string, hints?: RuleExtractHints) {
    const t = String(ocrText || '');
    const msd = hints?.msdPayload || {};
    const isCsdPo = isCsdPoHints(hints);

    // 12-digit Aadhaar (optional spaces), excluding fragments of 16-digit card numbers.
    const aadhaarDigits = CpcRuleExtractService.findAadhaarDigits(t);

    // Currency amount. "Rs"/"INR" must start a word (otherwise "MOTORS 350" reads as
    // "RS 350"), and at least one digit is required (a lone comma is not an amount).
    const invoiceMatch = t.match(/(?:₹|\b(?:Rs\.?|INR))\s?\d[\d,]*(?:\.\d{1,2})?/i);

    // Order / authorisation number.
    //  - The keyword must not be followed by a letter, so "Authority", "Authorized",
    //    "Ordered" no longer match via their prefix, and "Authorization: X" no longer
    //    yields "orization".
    //  - An optional "No." / "Number" / "#" between keyword and value is skipped.
    //  - The value must contain a digit, so "Order Date" no longer yields "Date".
    const orderMatch = t.match(
      /\b(?:PO|ORDER|AUTHORI[SZ]ATION|AUTH)(?![A-Za-z])\s*(?:(?:No|Number|Num)\b\.?|#)?\s*[:\-]?\s*((?=[A-Z0-9\-\/]*\d)[A-Z0-9\-\/]{4,})/i
    );

    // "Name: [Value]" / "Authorised Person" / applicant-style labels.
    // The value is limited to one line: only space and tab are allowed inside it, so
    // the capture cannot run on into the address or other following lines.
    const nameMatch = t.match(/\bName\s*[:\-]\s*([A-Za-z][A-Za-z0-9 \t.'-]{2,79})/i);
    const authPersonMatch = t.match(
      /\b(?:authorized|authorised)\s+person\s*[:\-]\s*([A-Za-z][A-Za-z0-9 \t.'-]{2,79})/i
    );
    const applicantMatch = t.match(
      /\b(?:applicant|holder|customer|borrower|dealer)\s*[:\-]\s*([A-Za-z][A-Za-z0-9 \t.'-]{2,79})/i
    );

    let displayNameRaw = isCsdPo
      ? extractCsdPoNameInDescriptionColumn(t) || extractCsdPoBuyerFromLabels(t) || ''
      : '';
    if (!displayNameRaw) {
      displayNameRaw = (authPersonMatch?.[1] || nameMatch?.[1] || applicantMatch?.[1] || '').trim();
    }

    // MSD-guided: name often appears in body exactly as user typed (no label) — same idea
    // as manual compare in CPC-CSD UI flow.
    if (!displayNameRaw) {
      // First NON-EMPTY candidate: an empty-string customer_name must not hide
      // authorized_person_name / name.
      const hint =
        [msd.customer_name, msd.authorized_person_name, msd.name]
          .map((v) => String(v ?? '').trim())
          .find((v) => v.length > 0) ?? '';
      if (hint) {
        // Short names get a lower fuzzy threshold.
        const minFuzzy = hint.length <= 10 ? 40 : 52;
        displayNameRaw =
          matchMsdNameInBody(t, hint) ||
          findMsdNameTokenInOcr(t, hint) ||
          pickNameLineByMsd(t, hint, minFuzzy) ||
          '';
      }
    }

    // Title / ALL CAPS line fallback — include short single names (e.g. "Arjun") skipped by older rules.
    if (!displayNameRaw) {
      const lines = t.split(/\r?\n/).map((l) => l.trim()).filter(Boolean);
      const noiseLine = /^(qty|ref|date|page|gst|hsn|po|no|id|total|amount|index|desc|sl)$/i;
      const headerLine = /^(ref|date|subject|to|from|dear|page|annex|authority|letter|royal|enfield|cpc|csd)\b/i;
      for (const line of lines) {
        if (line.length < 3 || line.length > 80) continue;
        if (noiseLine.test(line) || headerLine.test(line)) continue;
        if (isCsdPo && looksLikeCompanyLine(line)) continue;

        const words = line.split(/\s+/).filter(Boolean);
        const singleName =
          words.length === 1 &&
          /^[A-Za-z\u0900-\u097F]{2,25}$/.test(words[0]) &&
          !RE_VEHICLE_TOKENS.test(words[0]) &&
          !looksLikeCompanyLine(words[0]);
        const multiAllCaps =
          /^[A-Z][A-Z0-9\s.'-]{4,70}$/.test(line) && words.length >= 2;
        if (singleName || multiAllCaps) {
          displayNameRaw = line;
          break;
        }

        const titleCaseName =
          words.length >= 1 &&
          words.length <= 4 &&
          words.every((w) => /^[A-Za-z\u0900-\u097F]{2,}$/.test(w)) &&
          !words.some((w) => RE_VEHICLE_TOKENS.test(w)) &&
          line[0] === line[0].toUpperCase() &&
          /[a-z\u0900-\u097F]/.test(line) &&
          !looksLikeCompanyLine(line);
        if (titleCaseName && line.length <= 48) {
          displayNameRaw = line;
          break;
        }
      }
    }

    let displayName = displayNameRaw.length >= 2 ? displayNameRaw.replace(/\s+/g, ' ').trim() : null;
    if (isCsdPo && displayName) {
      displayName = CpcRuleExtractService.refineCsdPoCustomerName(t, displayName) ?? displayName;
    }
    if (displayName) {
      // Keep the un-normalised value when the normaliser returns nothing usable.
      const n = normalizePersonNameExtract(displayName);
      if (n) displayName = n;
    }

    // PAN (Indian format) + MSD hint (PDF may lack strict word boundaries).
    const panFromRegex = t.match(/\b([A-Z]{5}[0-9]{4}[A-Z])\b/i);
    let panVal = panFromRegex ? String(panFromRegex[1]).toUpperCase() : null;
    if (!panVal && msd.pan_number != null) {
      panVal = panFromMsdHint(t, msd.pan_number);
    }

    // Numeric amount for range matching against MSD invoice_value.
    // The second replace removes the dot left over from a "Rs." prefix.
    const amountDigits = invoiceMatch
      ? String(invoiceMatch[0]).replace(/[^\d.]/g, '').replace(/^\.+|\.+$/g, '')
      : null;
    let invoiceValueNormalized =
      amountDigits && amountDigits.length ? amountDigits : null;
    if (!invoiceValueNormalized) {
      invoiceValueNormalized =
        invoiceDigitsFromMsdHint(t, msd.po_amount) ||
        invoiceDigitsFromMsdHint(t, msd.letter_amount) ||
        invoiceDigitsFromMsdHint(t, msd.invoice_value);
    }

    const stampPresent = /(stamp|seal|authorized signatory|signature)/i.test(t);
    const govtStampPresent =
      /(govt\.?\s*stamp|government\s*seal|govt\.?\s*signatory|official\s*stamp|authorized\s*signatory)/i.test(t) ||
      stampPresent;
    const stampYesNo = govtStampPresent ? 'yes' : 'no';
    const poOrOrder = orderMatch ? orderMatch[1].trim() : null;

    return {
      extracted_fields: {
        authorized_person_name: displayName,
        customer_name: displayName,
        pan_number: panVal,
        order_or_authorisation_number: poOrOrder,
        po_number: poOrOrder,
        order_or_auth_number: poOrOrder,
        invoice_value: invoiceValueNormalized,
        po_amount: invoiceValueNormalized,
        letter_amount: invoiceValueNormalized,
        aadhaar_number: aadhaarDigits,
        aadhar_number: aadhaarDigits,
        stamp_or_signatory_present: stampPresent,
        stamp_sign_present: stampPresent,
        govt_signatory_and_stamp_present: stampYesNo,
        signature_and_stamp: stampYesNo
      },
      field_confidence: {
        authorized_person_name: displayName ? 0.65 : 0.2,
        customer_name: displayName ? 0.65 : 0.2,
        pan_number: panVal ? 0.85 : 0.2,
        order_or_authorisation_number: orderMatch ? 0.7 : 0.2,
        po_number: orderMatch ? 0.7 : 0.2,
        order_or_auth_number: orderMatch ? 0.7 : 0.2,
        invoice_value: invoiceValueNormalized ? 0.7 : 0.2,
        po_amount: invoiceValueNormalized ? 0.7 : 0.2,
        letter_amount: invoiceValueNormalized ? 0.7 : 0.2,
        aadhaar_number: aadhaarDigits ? 0.85 : 0.2,
        aadhar_number: aadhaarDigits ? 0.85 : 0.2,
        stamp_or_signatory_present: stampPresent ? 0.55 : 0.3,
        stamp_sign_present: stampPresent ? 0.55 : 0.3,
        govt_signatory_and_stamp_present: govtStampPresent ? 0.55 : 0.3,
        signature_and_stamp: govtStampPresent ? 0.55 : 0.3
      }
    };
  }
}