// 376 lines · 17 KiB · TypeScript
import { calculateMatch, normalizePersonNameExtract } from './utils';
|
|
|
|
/** Optional context a caller can pass to the rule-based extractor. */
export type RuleExtractHints = {
  /** MSD fields typed in the UI — used to find the same text inside the PDF (no "Name:" label needed). */
  msdPayload?: Record<string, unknown>;
  /**
   * Document type tag. When it denotes a CSD purchase order (e.g. `CSD_PO`), buyer/beneficiary
   * lines (Sold To, Bill To, …) are preferred over the first generic `Name:` (often the supplier).
   */
  documentType?: string;
};

/**
 * Regex-based extraction logic for CPC-CSD documents.
 * Provides a lightweight alternative to Gemini for common patterns.
 * Field names align with MSD payloads from the CPC dashboard (e.g. authority_letter).
 */

/**
 * Escapes every regular-expression metacharacter in `s` so the result can be embedded in a
 * `RegExp` source and match the input literally.
 *
 * @param s Text to escape.
 * @returns `s` with each of `. * + ? ^ $ { } ( ) | [ ] \` prefixed by a backslash.
 */
function escapeRegExp(s: string): string {
  // `$&` re-inserts the matched metacharacter after the added backslash.
  return s.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
}

/**
 * If the MSD name appears verbatim in the PDF text (case-insensitive, any run of whitespace
 * between words), returns the matched span with whitespace collapsed to single spaces.
 *
 * @param body Text to search.
 * @param expected Name as typed in MSD.
 * @returns The matched text from `body`, or `null` when `expected` is shorter than two
 *          characters after trimming or does not occur in `body`.
 */
function matchMsdNameInBody(body: string, expected: string): string | null {
  const needle = String(expected || '').trim();
  if (needle.length < 2) return null;

  // Escape first, then relax each whitespace run so line breaks / double spaces in the PDF still match.
  const flexibleSource = escapeRegExp(needle).replace(/\s+/g, '\\s+');
  const hit = body.match(new RegExp(flexibleSource, 'i'));
  return hit ? hit[0].replace(/\s+/g, ' ').trim() : null;
}

/**
 * Finds the MSD name as a whole line or as a single token on a line that also carries other
 * text (table cells, "Customer Arjun …"), where a strict substring match often fails.
 *
 * Lines longer than 160 characters are ignored. Tokens are split on whitespace and
 * `, ; : | / < > ( ) [ ]`, stripped of leading/trailing characters that are not Latin letters,
 * Devanagari or digits, and skipped when they are common table-header words.
 *
 * @param body OCR text to scan.
 * @param expected Name as typed in MSD.
 * @returns The matching line or token exactly as it appears in `body`, or `null`.
 */
function findMsdNameTokenInOcr(body: string, expected: string): string | null {
  const wanted = String(expected || '').trim();
  if (wanted.length < 2 || !body.trim()) return null;

  const wantedLower = wanted.toLowerCase();
  const headerNoise = /^(qty|ref|date|page|gst|hsn|po|no|id|by|to|of|in|at|sl|sr|index|desc|amount|total)$/i;
  const tokenDelimiters = /[\s,;:|/<>()[\]]+/;
  const edgePunctuation = /^[^A-Za-z\u0900-\u097F0-9]+|[^A-Za-z\u0900-\u097F0-9]+$/g;

  const lines = body
    .split(/\r?\n/)
    .map((l) => l.trim())
    .filter((l) => l.length > 0);

  for (const line of lines) {
    if (line.length > 160) continue;
    // A line that is exactly the name wins outright.
    if (line.toLowerCase() === wantedLower) return line;

    for (const rawToken of line.split(tokenDelimiters)) {
      const token = rawToken.replace(edgePunctuation, '');
      if (token.length < 2 || headerNoise.test(token)) continue;
      if (token.toLowerCase() === wantedLower) return token;
    }
  }
  return null;
}

/**
 * Picks the short line (3–119 characters) whose fuzzy score against the MSD name is highest
 * (authority letters often put the name on its own line).
 *
 * Lines that start with a boilerplate word (page, ref, date, to, from, dear, …) or a
 * `n / m` page counter are skipped. The keyword must end at a word boundary so that names
 * such as "Tomar", "Noor" or "Sirisha" are not discarded merely because they begin with
 * "to", "no" or "sir".
 *
 * @param body Text to scan, one candidate per line.
 * @param expected Name as typed in MSD.
 * @param minScore Minimum `calculateMatch` score a line must reach (default 52).
 * @returns The best-scoring line, or `null` when no line reaches `minScore`.
 */
function pickNameLineByMsd(body: string, expected: string, minScore = 52): string | null {
  const exp = String(expected || '').trim();
  if (exp.length < 2 || !body.trim()) return null;

  const boilerplateStart =
    /^(?:pages?|ref(?:erence)?|no\.?|dated?|subject|to|from|dear|sir|madam|annex(?:ure)?|schedule|authority|letter|royal|enfield|\d+\s*\/\s*\d+)\b/i;

  let best: { line: string; score: number } | null = null;
  const candidates = body
    .split(/\r?\n/)
    .map((l) => l.trim())
    .filter((l) => l.length > 2 && l.length < 120);

  for (const line of candidates) {
    if (boilerplateStart.test(line)) continue;
    const score = calculateMatch(exp, line, 'authorized_person_name');
    // Strict `>` keeps the earliest line when two lines tie.
    if (score >= minScore && (!best || score > best.score)) {
      best = { line, score };
    }
  }
  return best?.line ?? null;
}

/**
 * Canonicalises a PAN candidate: upper-cases it and removes all whitespace.
 *
 * @param s Raw PAN text (OCR or user input).
 * @returns The 10-character PAN (`AAAAA9999A` shape), or `null` when the cleaned text does
 *          not have exactly that shape.
 */
function normalizePan(s: string): string | null {
  const cleaned = String(s || '')
    .toUpperCase()
    .replace(/\s/g, '');
  return /^[A-Z]{5}[0-9]{4}[A-Z]$/.test(cleaned) ? cleaned : null;
}

/**
 * If the MSD PAN appears in the PDF text, returns the canonical PAN. OCR may split the PAN
 * with spaces or hyphens, so the body is compared with all whitespace and `-` removed.
 *
 * @param body PDF/OCR text.
 * @param msdPan PAN value from the MSD payload (any type; stringified before validation).
 * @returns The canonical PAN when it is valid and present in `body`, otherwise `null`.
 */
function panFromMsdHint(body: string, msdPan: unknown): string | null {
  const pan = normalizePan(String(msdPan ?? ''));
  if (!pan || !body) return null;

  const compactBody = body.toUpperCase().replace(/[\s-]/g, '');
  return compactBody.includes(pan) ? pan : null;
}

/**
 * If the integer part of the MSD amount appears among the digits of `body`, returns the MSD
 * amount as a normalized digit string (digits and decimal point only) for range matching.
 *
 * Leading and trailing dots are stripped after removing non-numeric characters, so an amount
 * typed as "Rs. 50000" normalizes to "50000" rather than ".50000" (whose empty integer part
 * could never match), and "1200." normalizes to "1200".
 *
 * @param body PDF/OCR text; all non-digits are removed before the containment check.
 * @param msdAmt Amount from the MSD payload (any type; stringified).
 * @returns The normalized amount, or `null` when the integer part has fewer than two digits
 *          or is not found in `body`.
 */
function invoiceDigitsFromMsdHint(body: string, msdAmt: unknown): string | null {
  const amount = String(msdAmt ?? '')
    .replace(/[^\d.]/g, '')
    .replace(/^\.+|\.+$/g, '');
  if (!amount) return null;

  const [intPart = ''] = amount.split('.');
  // One-digit amounts would match almost any document, so they are not accepted.
  if (intPart.length < 2) return null;

  return body.replace(/\D/g, '').includes(intPart) ? amount : null;
}

/** Keywords typical of supplier / letterhead lines — not the CSD customer's individual name. */
const RE_COMPANY_NAME_HINT =
  /\b(LIMITED|LTD\.?|L\.?\s*L\.?\s*P\.?|PVT\.?\s*LTD|PRIVATE\s+LIMITED|PVT|PTE|INC\.?|CORP|CORPORATION|INDIA\s+LTD|MOTORS?|AUTOMOBILES?|DEALERS?|ENTERPRISES?|SALES\s*(?:&|AND)?\s*SERVICE|WORKS|AGENCIES)\b/i;

/**
 * Heuristic: does this line look like a company / letterhead line rather than a person?
 *
 * True when the trimmed line contains a company keyword (see `RE_COMPANY_NAME_HINT`), or when
 * it is 14+ characters made only of upper-case letters, digits, `.`, `&`, `-` and whitespace
 * with no run of two or more whitespace characters.
 *
 * @param s Candidate line.
 * @returns `true` for company-looking lines; `false` otherwise, including empty input.
 */
function looksLikeCompanyLine(s: string): boolean {
  const line = String(s || '').trim();
  if (!line) return false;
  if (RE_COMPANY_NAME_HINT.test(line)) return true;

  const longAllCaps = /^[A-Z0-9.&\s\-]{14,}$/.test(line);
  const hasWideGap = /\s{2,}/.test(line);
  return longAllCaps && !hasWideGap;
}

/**
 * Cleans the text captured after a buyer label ("Sold To", "Bill To", …).
 *
 * Removes carriage returns and leading `:` / dash / whitespace, cuts the text at the first
 * following field keyword (GSTIN, PAN, Phone, Tel, Email, Mob, Mobile, Address, Qty, Quantity,
 * Part — whole words, case-insensitive) and collapses whitespace runs to single spaces.
 *
 * @param raw Raw captured text.
 * @returns The cleaned text; an empty string when nothing precedes the first field keyword.
 */
function trimBuyerCapture(raw: string): string {
  const withoutLead = String(raw || '')
    .replace(/\r/g, '')
    .trim()
    .replace(/^[:\-–—\s]+/, '');

  // `split` always yields at least one element; the fallback only satisfies strict index checks.
  const [beforeNextField = withoutLead] = withoutLead.split(
    /\b(?:GSTIN|PAN|Phone|Tel|Email|E-?mail|Mob|Mobile|Address|Qty|Quantity|Part)\b/i
  );
  return beforeNextField.trim().replace(/\s+/g, ' ').trim();
}

/**
 * Tells whether the hints describe a CSD purchase order.
 *
 * @param hints Optional extraction hints.
 * @returns `true` when `hints.documentType`, upper-cased, contains `CSD_PO` or
 *          `PURCHASE_ORDER`; `false` otherwise (including missing hints or type).
 */
function isCsdPoHints(hints?: RuleExtractHints): boolean {
  const docType = String(hints?.documentType || '').toUpperCase();
  return ['CSD_PO', 'PURCHASE_ORDER'].some((tag) => docType.includes(tag));
}

/**
 * Whole-word tokens that belong to the vehicle / line-item description rather than to a
 * person's name. Many CSD PO line-items print a 16-digit card/UIN, then the customer name,
 * then plot no / address in the Description column, so vehicle words can sit right next to
 * the name.
 */
const RE_VEHICLE_TOKENS =
  /^(ROYAL|ENFIELD|METEOR|CLASSIC|BULLET|HIMALAYAN|INTERCEPTOR|CONTINENTAL|STELLAR|THUNDER|BS-?VI|BSVI|SUPER|VARIANT|MODEL|CC|HP|ABS|QTY|HSN)$/i;

/**
 * Decides whether text found after a card number in a PO description column can be a
 * person's name.
 *
 * Accepts 3–72 characters in 1–6 words, none of which is a vehicle token, with at least one
 * word made only of two or more Latin/Devanagari letters, and no company keyword.
 *
 * Only the keyword test (`RE_COMPANY_NAME_HINT`) is applied here. The broader
 * `looksLikeCompanyLine` heuristic also flags any all-caps string of 14+ characters, and
 * names in this column are routinely printed in capitals — it rejected
 * "KALAIYARASAN K" (exactly 14 characters).
 *
 * @param s Candidate name text.
 * @returns `true` when the text passes every check above.
 */
function isPlausibleHumanNameFromPoDescription(s: string): boolean {
  const text = String(s || '')
    .replace(/\s+/g, ' ')
    .trim();
  if (text.length < 3 || text.length > 72) return false;

  // Whitespace is already collapsed, so a plain space split yields the words.
  const words = text.split(' ');
  if (words.length > 6) return false;
  if (RE_COMPANY_NAME_HINT.test(text)) return false;
  if (words.some((w) => RE_VEHICLE_TOKENS.test(w))) return false;

  return words.some((w) => /^[A-Za-z\u0900-\u097F]{2,}$/.test(w));
}

/**
 * Extracts the customer name printed after a 16-digit card/UIN in a PO description column.
 *
 * Pattern: `5312423002619089 KALAIYARASAN K 71` — 16 digits (contiguous or in four groups of
 * four separated by whitespace), then name tokens, then often a short plot/house number or a
 * newline followed by the address.
 *
 * Space-grouped numbers are tried first, then contiguous ones; the first candidate accepted
 * by `isPlausibleHumanNameFromPoDescription` is returned.
 *
 * Name tokens must be on one line: they are joined by horizontal whitespace only, so the
 * first words of the following address line are never absorbed into the name.
 *
 * @param body OCR text.
 * @returns The name with single spaces between words, or `null` when none is found.
 */
function extractCsdPoNameInDescriptionColumn(body: string): string | null {
  const norm = body.replace(/\r\n/g, '\n').replace(/\u00a0/g, ' ');
  const cardNumberPatterns: RegExp[] = [/\b\d{4}\s+\d{4}\s+\d{4}\s+\d{4}\b/g, /\b\d{16}\b/g];

  // Up to six letter-only words, ending before a 1–4 digit number, a line break or end of text.
  const strictName =
    /^([A-Za-z\u0900-\u097F]+(?:[^\S\r\n]+[A-Za-z\u0900-\u097F]+){0,5})(?=\s+\d{1,4}\b|\s*\n|\s*$)/;
  // Fallback: just the first word when the strict shape does not fit (e.g. "ARJUN, 71").
  const looseName = /^([A-Za-z\u0900-\u097F]{2,25})\b/;

  for (const pattern of cardNumberPatterns) {
    for (const cardMatch of norm.matchAll(pattern)) {
      const afterCard = norm.slice((cardMatch.index ?? 0) + cardMatch[0].length).replace(/^\s+/, '');

      let rawName = strictName.exec(afterCard)?.[1];
      if (!rawName) {
        const firstWord = looseName.exec(afterCard)?.[1];
        if (firstWord && isPlausibleHumanNameFromPoDescription(firstWord)) rawName = firstWord;
      }
      if (!rawName) continue;

      const candidate = rawName.replace(/\s+/g, ' ').trim();
      if (isPlausibleHumanNameFromPoDescription(candidate)) return candidate;
    }
  }
  return null;
}

/**
 * Label fragments (regex source) that introduce the buyer on CSD / defence-style POs, in
 * priority order: the first label that yields a usable line wins.
 */
const CSD_PO_BUYER_LABEL_SOURCES: readonly string[] = [
  String.raw`Sold\s*To`,
  String.raw`Bill\s*To`,
  String.raw`Ship\s*To`,
  String.raw`Consignee`,
  String.raw`(?:Buyer|Purchaser)`,
  String.raw`Customer\s*(?:Name|Details)?`,
  String.raw`CSD\s*Card(?:\s*Holder)?`,
  String.raw`Card\s*Holder(?:\s*Name)?`,
  String.raw`Beneficiary`,
  String.raw`Name\s*of\s*(?:the\s*)?(?:Purchaser|Buyer|Customer)`,
  String.raw`(?:Ordered|Order)\s*(?:By|Placed\s*By)`
];

/**
 * One pattern per label: the label at the start of a line, an optional `:` or `-`, then the
 * value (2–120 non-newline characters) on the same or a following line. Built once at load.
 */
const CSD_PO_BUYER_PATTERNS: readonly RegExp[] = CSD_PO_BUYER_LABEL_SOURCES.map(
  (label) => new RegExp(String.raw`(?:^|\n)\s*${label}\s*[:\-]?\s*\n*\s*([^\n\r]{2,120})`, 'i')
);

/**
 * CSD / defence-style POs usually put the customer under Sold To / Bill To / card holder,
 * not under the first "Name:" (often dealer contact).
 *
 * For each label, in priority order, the first occurrence in the text is cleaned with
 * `trimBuyerCapture`; it is returned unless it is shorter than 2 or longer than 100
 * characters, or starts with a header word (page, date, amount, total, ref, subject).
 *
 * @param body OCR text.
 * @returns The cleaned buyer line, or `null` when no label yields a usable value.
 */
function extractCsdPoBuyerFromLabels(body: string): string | null {
  const norm = body.replace(/\r\n/g, '\n');

  for (const pattern of CSD_PO_BUYER_PATTERNS) {
    const captured = norm.match(pattern)?.[1];
    if (!captured) continue;

    const line = trimBuyerCapture(captured);
    if (line.length < 2 || line.length > 100) continue;
    if (/^(page|date|amount|total|ref|subject)\b/i.test(line)) continue;
    return line;
  }
  return null;
}

/**
 * Rule-based (regex) field extraction for CPC-CSD documents.
 * All members are static; the class holds no state.
 */
export class CpcRuleExtractService {
  /**
   * If Vertex returned a supplier-style string but OCR shows a clear buyer line, prefer the buyer line.
   *
   * The buyer is taken from the PO description column first, then from buyer labels.
   *
   * @param ocrText OCR text of the purchase order.
   * @param customerName Name produced by an earlier extraction step (any type; stringified).
   * @returns The buyer line when there is no current name, or when the current name looks like
   *          a company and the buyer line does not; otherwise the current name. `null` when
   *          no buyer line exists and the current name is shorter than two characters.
   */
  static refineCsdPoCustomerName(ocrText: string, customerName: unknown): string | null {
    const cur = String(customerName ?? '').trim();
    const text = String(ocrText || '');
    const buyer = extractCsdPoNameInDescriptionColumn(text) || extractCsdPoBuyerFromLabels(text);

    if (!buyer) return cur.length >= 2 ? cur : null;
    if (!cur) return buyer;
    if (looksLikeCompanyLine(cur) && !looksLikeCompanyLine(buyer)) return buyer;
    return cur;
  }

  /**
   * Name following an explicit label, in priority order: "Authorised/Authorized person",
   * "Name", then applicant-style labels (applicant, holder, customer, borrower, dealer).
   *
   * The value may start on the line after the label, but the captured value itself is
   * limited to one line so the next field's label is never absorbed into the name.
   *
   * @returns The trimmed captured name, or `''` when no label matches.
   */
  private static findLabelledName(t: string): string {
    const authPersonMatch = t.match(
      /\b(?:authorized|authorised)\s+person\s*[:\-]\s*([A-Za-z][A-Za-z0-9 \t.'-]{2,79})/i
    );
    const nameMatch = t.match(/\bName\s*[:\-]\s*([A-Za-z][A-Za-z0-9 \t.'-]{2,79})/i);
    const applicantMatch = t.match(
      /\b(?:applicant|holder|customer|borrower|dealer)\s*[:\-]\s*([A-Za-z][A-Za-z0-9 \t.'-]{2,79})/i
    );
    return (authPersonMatch?.[1] || nameMatch?.[1] || applicantMatch?.[1] || '').trim();
  }

  /**
   * MSD-guided lookup: the name often appears in the body exactly as the user typed it (no
   * label). Tries a verbatim match, then a single-token match, then the best fuzzy line.
   *
   * @returns The text found in `t`, or `''` when the payload has no name or nothing matches.
   */
  private static findNameByMsdHint(t: string, msd: Record<string, unknown>): string {
    const hint = String(msd.customer_name ?? msd.authorized_person_name ?? msd.name ?? '').trim();
    if (!hint) return '';

    // Short names get a lower fuzzy threshold.
    const minFuzzy = hint.length <= 10 ? 40 : 52;
    return (
      matchMsdNameInBody(t, hint) ||
      findMsdNameTokenInOcr(t, hint) ||
      pickNameLineByMsd(t, hint, minFuzzy) ||
      ''
    );
  }

  /**
   * Last-resort fallback: the first line that looks like a standalone name — a single
   * letter-only word (e.g. "Arjun"), a multi-word ALL-CAPS line, or a 1–4 word title-case
   * line of at most 48 characters. Boilerplate lines are skipped, as are company-looking
   * lines when the document is a CSD PO.
   *
   * @returns The matching line, or `''` when no line qualifies.
   */
  private static findStandaloneNameLine(t: string, isCsdPo: boolean): string {
    const noiseLine = /^(qty|ref|date|page|gst|hsn|po|no|id|total|amount|index|desc|sl)$/i;
    const boilerplateStart =
      /^(ref|date|subject|to|from|dear|page|annex|authority|letter|royal|enfield|cpc|csd)\b/i;
    const lines = t
      .split(/\r?\n/)
      .map((l) => l.trim())
      .filter(Boolean);

    for (const line of lines) {
      if (line.length < 3 || line.length > 80) continue;
      if (noiseLine.test(line) || boilerplateStart.test(line)) continue;
      if (isCsdPo && looksLikeCompanyLine(line)) continue;

      const words = line.split(/\s+/).filter(Boolean);
      const [firstWord = ''] = words;

      const singleName =
        words.length === 1 &&
        /^[A-Za-z\u0900-\u097F]{2,25}$/.test(firstWord) &&
        !RE_VEHICLE_TOKENS.test(firstWord) &&
        !looksLikeCompanyLine(firstWord);
      const multiAllCaps = words.length >= 2 && /^[A-Z][A-Z0-9\s.'-]{4,70}$/.test(line);
      if (singleName || multiAllCaps) return line;

      const firstChar = line.charAt(0);
      const titleCaseName =
        words.length >= 1 &&
        words.length <= 4 &&
        words.every((w) => /^[A-Za-z\u0900-\u097F]{2,}$/.test(w)) &&
        !words.some((w) => RE_VEHICLE_TOKENS.test(w)) &&
        firstChar === firstChar.toUpperCase() &&
        /[a-z\u0900-\u097F]/.test(line) &&
        !looksLikeCompanyLine(line);
      if (titleCaseName && line.length <= 48) return line;
    }
    return '';
  }

  /**
   * Extracts name, PAN, order number, amount, Aadhaar and stamp/signatory flags from OCR
   * text using regular expressions, optionally guided by MSD values.
   *
   * Name resolution order: CSD PO buyer (description column, then buyer labels) when the
   * hints denote a CSD PO; labelled name; MSD-guided match; standalone name line. For a CSD
   * PO the result is then passed through `refineCsdPoCustomerName`, and every name through
   * `normalizePersonNameExtract`.
   *
   * @param ocrText OCR text of the document.
   * @param hints Optional MSD payload and document type.
   * @returns `extracted_fields` (value or `null` per field; several fields are aliases of the
   *          same value) and `field_confidence` (a fixed score per field, higher when found).
   */
  static extractWithRules(ocrText: string, hints?: RuleExtractHints) {
    const t = String(ocrText || '');
    const msd = hints?.msdPayload || {};
    const isCsdPo = isCsdPoHints(hints);

    // 12-digit Aadhaar (optional spaces between groups of four). The lookarounds reject a
    // 12-digit window that is really part of a longer run of 4-digit groups, such as the
    // 16-digit card number printed on CSD POs ("5312 4230 0261 9089").
    const aadhaarMatch = t.match(/(?<!\d{4}\s?)\b\d{4}\s?\d{4}\s?\d{4}\b(?!\s?\d{4}\b)/);

    // Currency amount. The word boundary stops "rs" inside ordinary words ("numbers 16")
    // from being read as a rupee marker.
    const invoiceMatch = t.match(/(?:₹|\bRs\.?|\bINR)\s?[\d,]+(?:\.\d{1,2})?/i);

    // Order / authorisation number. The keyword must not run on into further letters
    // ("Authority" is not "AUTH" + "ority"), and the captured token must contain a digit so
    // that a following word ("Order Date") is not taken as the number.
    const orderMatch = t.match(
      /\b(?:PO|ORDER|AUTHORI[SZ]ATION|AUTH)(?![A-Z])\s*[:\-]?\s*((?=[A-Z0-9\-\/]*\d)[A-Z0-9\-\/]{4,})/i
    );

    // ---- Name --------------------------------------------------------------------------
    let displayNameRaw = isCsdPo
      ? extractCsdPoNameInDescriptionColumn(t) || extractCsdPoBuyerFromLabels(t) || ''
      : '';
    if (!displayNameRaw) displayNameRaw = CpcRuleExtractService.findLabelledName(t);
    if (!displayNameRaw) displayNameRaw = CpcRuleExtractService.findNameByMsdHint(t, msd);
    if (!displayNameRaw) displayNameRaw = CpcRuleExtractService.findStandaloneNameLine(t, isCsdPo);

    let displayName = displayNameRaw.length >= 2 ? displayNameRaw.replace(/\s+/g, ' ').trim() : null;
    if (isCsdPo && displayName) {
      displayName = CpcRuleExtractService.refineCsdPoCustomerName(t, displayName) ?? displayName;
    }
    if (displayName) {
      const normalized = normalizePersonNameExtract(displayName);
      if (normalized) displayName = normalized;
    }

    // ---- PAN (Indian format) + MSD hint (PDF may lack strict word boundaries) ------------
    const panFromRegex = t.match(/\b([A-Z]{5}[0-9]{4}[A-Z])\b/i);
    let panVal = panFromRegex ? String(panFromRegex[1]).toUpperCase() : null;
    if (!panVal && msd.pan_number != null) {
      panVal = panFromMsdHint(t, msd.pan_number);
    }

    // ---- Numeric amount for range matching against MSD invoice_value ---------------------
    const amountDigits = invoiceMatch
      ? String(invoiceMatch[0]).replace(/[^\d.]/g, '').replace(/^\.+|\.+$/g, '')
      : null;
    let invoiceValueNormalized = amountDigits && amountDigits.length ? amountDigits : null;
    if (!invoiceValueNormalized) {
      invoiceValueNormalized =
        invoiceDigitsFromMsdHint(t, msd.po_amount) ||
        invoiceDigitsFromMsdHint(t, msd.letter_amount) ||
        invoiceDigitsFromMsdHint(t, msd.invoice_value);
    }

    // ---- Stamp / signatory (both "authorized" and "authorised" spellings) ----------------
    const stampPresent = /(stamp|seal|authori[sz]ed\s+signatory|signature)/i.test(t);
    const govtStampPresent =
      /(govt\.?\s*stamp|government\s*seal|govt\.?\s*signatory|official\s*stamp|authori[sz]ed\s*signatory)/i.test(t) ||
      stampPresent;
    const stampYesNo = govtStampPresent ? 'yes' : 'no';

    const poOrOrder = orderMatch?.[1]?.trim() ?? null;
    const aadhaarDigits = aadhaarMatch ? aadhaarMatch[0].replace(/\s/g, '').trim() : null;

    return {
      extracted_fields: {
        authorized_person_name: displayName,
        customer_name: displayName,
        pan_number: panVal,
        order_or_authorisation_number: poOrOrder,
        po_number: poOrOrder,
        order_or_auth_number: poOrOrder,
        invoice_value: invoiceValueNormalized,
        po_amount: invoiceValueNormalized,
        letter_amount: invoiceValueNormalized,
        aadhaar_number: aadhaarDigits,
        aadhar_number: aadhaarDigits,
        stamp_or_signatory_present: stampPresent,
        stamp_sign_present: stampPresent,
        govt_signatory_and_stamp_present: stampYesNo,
        signature_and_stamp: stampYesNo
      },
      field_confidence: {
        authorized_person_name: displayName ? 0.65 : 0.2,
        customer_name: displayName ? 0.65 : 0.2,
        pan_number: panVal ? 0.85 : 0.2,
        order_or_authorisation_number: orderMatch ? 0.7 : 0.2,
        po_number: orderMatch ? 0.7 : 0.2,
        order_or_auth_number: orderMatch ? 0.7 : 0.2,
        invoice_value: invoiceValueNormalized ? 0.7 : 0.2,
        po_amount: invoiceValueNormalized ? 0.7 : 0.2,
        letter_amount: invoiceValueNormalized ? 0.7 : 0.2,
        aadhaar_number: aadhaarMatch ? 0.85 : 0.2,
        aadhar_number: aadhaarMatch ? 0.85 : 0.2,
        stamp_or_signatory_present: stampPresent ? 0.55 : 0.3,
        stamp_sign_present: stampPresent ? 0.55 : 0.3,
        govt_signatory_and_stamp_present: govtStampPresent ? 0.55 : 0.3,
        signature_and_stamp: govtStampPresent ? 0.55 : 0.3
      }
    };
  }
}