From ad0ca7f183729eb14e08a56a58069beb43d106c8 Mon Sep 17 00:00:00 2001 From: DaKheera47 Date: Tue, 20 Jan 2026 23:27:57 +0000 Subject: [PATCH] dedupe and tests --- orchestrator/src/server/api/routes/jobs.ts | 15 +- .../src/server/pipeline/orchestrator.ts | 14 +- .../server/pipeline/sponsor-matching.test.ts | 14 ++ .../services/visa-sponsors/index.test.ts | 107 ++++++++++++++ .../server/services/visa-sponsors/index.ts | 135 ++++++++++-------- 5 files changed, 204 insertions(+), 81 deletions(-) create mode 100644 orchestrator/src/server/services/visa-sponsors/index.test.ts diff --git a/orchestrator/src/server/api/routes/jobs.ts b/orchestrator/src/server/api/routes/jobs.ts index f33e5e5..366cbcc 100644 --- a/orchestrator/src/server/api/routes/jobs.ts +++ b/orchestrator/src/server/api/routes/jobs.ts @@ -160,20 +160,7 @@ jobsRouter.post('/:id/check-sponsor', async (req: Request, res: Response) => { minScore: 50, }); - let sponsorMatchScore = 0; - let sponsorMatchNames: string | null = null; - - if (sponsorResults.length > 0) { - const topScore = sponsorResults[0].score; - // Get all 100% matches, or just the top match - const perfectMatches = sponsorResults.filter(r => r.score === 100); - const matchesToReport = perfectMatches.length >= 2 - ? perfectMatches.slice(0, 2) - : [sponsorResults[0]]; - - sponsorMatchScore = topScore; - sponsorMatchNames = JSON.stringify(matchesToReport.map(r => r.sponsor.organisationName)); - } + const { sponsorMatchScore, sponsorMatchNames } = visaSponsors.calculateSponsorMatchSummary(sponsorResults); // Update job with sponsor match info const updatedJob = await jobsRepo.updateJob(job.id, { diff --git a/orchestrator/src/server/pipeline/orchestrator.ts b/orchestrator/src/server/pipeline/orchestrator.ts index bfd605a..dd204c9 100644 --- a/orchestrator/src/server/pipeline/orchestrator.ts +++ b/orchestrator/src/server/pipeline/orchestrator.ts @@ -304,17 +304,9 @@ export async function runPipeline(config: Partial = {}): Promise minScore: 50, }); - if (sponsorResults.length > 0) { - const topScore = sponsorResults[0].score; - // Get all 100% matches, or just the top match - const perfectMatches = sponsorResults.filter(r => r.score === 100); - const matchesToReport = perfectMatches.length >= 2 - ? perfectMatches.slice(0, 2) - : [sponsorResults[0]]; - - sponsorMatchScore = topScore; - sponsorMatchNames = JSON.stringify(matchesToReport.map(r => r.sponsor.organisationName)); - } + const summary = visaSponsors.calculateSponsorMatchSummary(sponsorResults); + sponsorMatchScore = summary.sponsorMatchScore; + sponsorMatchNames = summary.sponsorMatchNames ?? undefined; } // Update score and sponsor match in database diff --git a/orchestrator/src/server/pipeline/sponsor-matching.test.ts b/orchestrator/src/server/pipeline/sponsor-matching.test.ts index d8c8d00..5ef0daa 100644 --- a/orchestrator/src/server/pipeline/sponsor-matching.test.ts +++ b/orchestrator/src/server/pipeline/sponsor-matching.test.ts @@ -11,6 +11,7 @@ import type { Job } from '../../shared/types.js'; // Mock the visa-sponsors module vi.mock('../services/visa-sponsors/index.js', () => ({ searchSponsors: vi.fn(), + calculateSponsorMatchSummary: vi.fn(), })); // Mock the scorer module @@ -115,6 +116,7 @@ const createMockJob = (overrides: Partial = {}): Job => ({ describe('Sponsor Match Calculation', () => { let searchSponsors: ReturnType; + let calculateSponsorMatchSummary: ReturnType; let scoreJobSuitability: ReturnType; let updateJob: ReturnType; let getUnscoredDiscoveredJobs: ReturnType; @@ -129,6 +131,7 @@ describe('Sponsor Match Calculation', () => { const jobsRepo = await import('../repositories/jobs.js'); searchSponsors = visaSponsors.searchSponsors as ReturnType; + calculateSponsorMatchSummary = visaSponsors.calculateSponsorMatchSummary as ReturnType; scoreJobSuitability = scorer.scoreJobSuitability as ReturnType; updateJob = jobsRepo.updateJob as ReturnType; getUnscoredDiscoveredJobs = jobsRepo.getUnscoredDiscoveredJobs as ReturnType; @@ -138,6 +141,17 @@ describe('Sponsor Match Calculation', () => { scoreJobSuitability.mockResolvedValue({ score: 75, reason: 'Good match' }); bulkCreateJobs.mockResolvedValue({ created: 0, skipped: 0 }); updateJob.mockResolvedValue(undefined); + + calculateSponsorMatchSummary.mockImplementation((results: any[]) => { + if (results.length === 0) return { sponsorMatchScore: 0, sponsorMatchNames: null }; + const topScore = results[0].score; + const perfectMatches = results.filter((r: any) => r.score === 100); + const matchesToReport = perfectMatches.length >= 2 ? perfectMatches.slice(0, 2) : [results[0]]; + return { + sponsorMatchScore: topScore, + sponsorMatchNames: JSON.stringify(matchesToReport.map((r: any) => r.sponsor.organisationName)), + }; + }); }); afterEach(() => { diff --git a/orchestrator/src/server/services/visa-sponsors/index.test.ts b/orchestrator/src/server/services/visa-sponsors/index.test.ts new file mode 100644 index 0000000..93272a2 --- /dev/null +++ b/orchestrator/src/server/services/visa-sponsors/index.test.ts @@ -0,0 +1,107 @@ +import { describe, it, expect } from 'vitest'; +import { calculateSponsorMatchSummary } from './index.js'; +import type { VisaSponsorSearchResult } from '../../../shared/types.js'; + +describe('calculateSponsorMatchSummary', () => { + it('should return default values for empty results', () => { + const results: VisaSponsorSearchResult[] = []; + const summary = calculateSponsorMatchSummary(results); + + expect(summary.sponsorMatchScore).toBe(0); + expect(summary.sponsorMatchNames).toBeNull(); + }); + + it('should report the top match when it is not a perfect match', () => { + const results: VisaSponsorSearchResult[] = [ + { + score: 85, + sponsor: { organisationName: 'Tech Corp' } as any, + matchedName: 'tech corp' + }, + { + score: 60, + sponsor: { organisationName: 'Other Ltd' } as any, + matchedName: 'other' + } + ]; + + const summary = calculateSponsorMatchSummary(results); + + expect(summary.sponsorMatchScore).toBe(85); + expect(summary.sponsorMatchNames).toBe(JSON.stringify(['Tech Corp'])); + }); + + it('should report a single perfect match', () => { + const results: VisaSponsorSearchResult[] = [ + { + score: 100, + sponsor: { organisationName: 'Exact Match Ltd' } as any, + matchedName: 'exact match' + }, + { + score: 90, + sponsor: { organisationName: 'Close Match' } as any, + matchedName: 'close' + } + ]; + + const summary = calculateSponsorMatchSummary(results); + + expect(summary.sponsorMatchScore).toBe(100); + expect(summary.sponsorMatchNames).toBe(JSON.stringify(['Exact Match Ltd'])); + }); + + it('should report exactly two 100% matches when two or more exist', () => { + const results: VisaSponsorSearchResult[] = [ + { + score: 100, + sponsor: { organisationName: 'First PerfectMatch' } as any, + matchedName: 'match' + }, + { + score: 100, + sponsor: { organisationName: 'Second PerfectMatch' } as any, + matchedName: 'match' + }, + { + score: 100, + sponsor: { organisationName: 'Third PerfectMatch' } as any, + matchedName: 'match' + }, + { + score: 50, + sponsor: { organisationName: 'Common Co' } as any, + matchedName: 'common' + } + ]; + + const summary = calculateSponsorMatchSummary(results); + + expect(summary.sponsorMatchScore).toBe(100); + const names = JSON.parse(summary.sponsorMatchNames!); + expect(names).toHaveLength(2); + expect(names).toContain('First PerfectMatch'); + expect(names).toContain('Second PerfectMatch'); + expect(names).not.toContain('Third PerfectMatch'); + }); + + it('should only report the single top result if no 100% matches exist', () => { + const results: VisaSponsorSearchResult[] = [ + { + score: 99, + sponsor: { organisationName: 'Almost Perfect' } as any, + matchedName: 'almost' + }, + { + score: 98, + sponsor: { organisationName: 'Second Best' } as any, + matchedName: 'best' + } + ]; + + const summary = calculateSponsorMatchSummary(results); + + expect(summary.sponsorMatchScore).toBe(99); + expect(summary.sponsorMatchNames).toBe(JSON.stringify(['Almost Perfect'])); + }); +}); diff --git a/orchestrator/src/server/services/visa-sponsors/index.ts b/orchestrator/src/server/services/visa-sponsors/index.ts index b01c5f1..963d887 100644 --- a/orchestrator/src/server/services/visa-sponsors/index.ts +++ b/orchestrator/src/server/services/visa-sponsors/index.ts @@ -57,20 +57,20 @@ let updateError: string | null = null; */ export function normalizeCompanyName(name: string): string { let normalized = name.toLowerCase().trim(); - + // Remove common punctuation and special chars normalized = normalized.replace(/[.,'"()[\]{}!?@#$%^&*+=|\\/<>:;`~]/g, ' '); - + // Remove suffixes for (const suffix of COMPANY_SUFFIXES) { // Word boundary matching const regex = new RegExp(`\\b${suffix}\\b`, 'gi'); normalized = normalized.replace(regex, ''); } - + // Collapse whitespace normalized = normalized.replace(/\s+/g, ' ').trim(); - + return normalized; } @@ -81,27 +81,27 @@ export function normalizeCompanyName(name: string): string { export function calculateSimilarity(str1: string, str2: string): number { const s1 = str1.toLowerCase(); const s2 = str2.toLowerCase(); - + if (s1 === s2) return 100; if (s1.length === 0 || s2.length === 0) return 0; - + // Check if one contains the other if (s1.includes(s2) || s2.includes(s1)) { const longerLen = Math.max(s1.length, s2.length); const shorterLen = Math.min(s1.length, s2.length); return Math.round((shorterLen / longerLen) * 100); } - + // Levenshtein distance const matrix: number[][] = []; - + for (let i = 0; i <= s1.length; i++) { matrix[i] = [i]; } for (let j = 0; j <= s2.length; j++) { matrix[0][j] = j; } - + for (let i = 1; i <= s1.length; i++) { for (let j = 1; j <= s2.length; j++) { const cost = s1[i - 1] === s2[j - 1] ? 0 : 1; @@ -112,10 +112,10 @@ export function calculateSimilarity(str1: string, str2: string): number { ); } } - + const distance = matrix[s1.length][s2.length]; const maxLen = Math.max(s1.length, s2.length); - + return Math.round(((maxLen - distance) / maxLen) * 100); } @@ -125,12 +125,12 @@ export function calculateSimilarity(str1: string, str2: string): number { export function parseCsv(content: string): VisaSponsor[] { const lines = content.split('\n'); const sponsors: VisaSponsor[] = []; - + // Skip header for (let i = 1; i < lines.length; i++) { const line = lines[i].trim(); if (!line) continue; - + // Parse CSV with proper quote handling const fields = parseCSVLine(line); if (fields.length >= 5) { @@ -143,7 +143,7 @@ export function parseCsv(content: string): VisaSponsor[] { }); } } - + return sponsors; } @@ -154,11 +154,11 @@ function parseCSVLine(line: string): string[] { const fields: string[] = []; let current = ''; let inQuotes = false; - + for (let i = 0; i < line.length; i++) { const char = line[i]; const nextChar = line[i + 1]; - + if (char === '"' && !inQuotes) { inQuotes = true; } else if (char === '"' && inQuotes) { @@ -176,7 +176,7 @@ function parseCSVLine(line: string): string[] { current += char; } } - + fields.push(current.trim()); return fields; } @@ -186,7 +186,7 @@ function parseCSVLine(line: string): string[] { */ function getCsvFiles(): string[] { if (!fs.existsSync(DATA_DIR)) return []; - + return fs.readdirSync(DATA_DIR) .filter(f => f.endsWith('.csv')) .sort() @@ -245,25 +245,25 @@ function cleanupOldCsvFiles(): void { */ async function extractCsvUrl(): Promise { const pageUrl = 'https://www.gov.uk/government/publications/register-of-licensed-sponsors-workers'; - + console.log('📄 Fetching gov.uk page to find CSV link...'); const response = await fetch(pageUrl); - + if (!response.ok) { throw new Error(`Failed to fetch gov.uk page: ${response.status} ${response.statusText}`); } - + const html = await response.text(); - + // Look for the Worker and Temporary Worker CSV link const csvMatch = html.match( /href="(https:\/\/assets\.publishing\.service\.gov\.uk\/media\/[^"]+Worker_and_Temporary_Worker\.csv)"/ ); - + if (!csvMatch) { throw new Error('Could not find Worker and Temporary Worker CSV link on gov.uk page'); } - + return csvMatch[1]; } @@ -274,52 +274,52 @@ export async function downloadLatestCsv(): Promise<{ success: boolean; message: if (isUpdating) { return { success: false, message: 'Update already in progress' }; } - + isUpdating = true; updateError = null; - + try { // Extract the CSV URL from the page const csvUrl = await extractCsvUrl(); console.log(`📥 Downloading CSV from: ${csvUrl}`); - + const response = await fetch(csvUrl); - + if (!response.ok) { throw new Error(`Failed to download CSV: ${response.status} ${response.statusText}`); } - + const csvContent = await response.text(); - + // Validate CSV has content const sponsors = parseCsv(csvContent); if (sponsors.length === 0) { throw new Error('Downloaded CSV appears to be empty or invalid'); } - + // Generate filename with date const dateStr = new Date().toISOString().split('T')[0]; const filename = `visa_sponsors_${dateStr}.csv`; const filepath = path.join(DATA_DIR, filename); - + // Save the CSV fs.writeFileSync(filepath, csvContent); - + // Update metadata writeMetadata({ lastUpdated: new Date().toISOString(), csvFile: filename, }); - + // Cleanup old files cleanupOldCsvFiles(); - + // Clear cache so next search loads new data sponsorsCache = null; cacheLoadedAt = null; - + console.log(`✅ Downloaded visa sponsor list: ${sponsors.length} sponsors`); - + return { success: true, message: `Successfully downloaded ${sponsors.length} sponsors`, @@ -345,17 +345,17 @@ export function loadSponsors(): VisaSponsor[] { return sponsorsCache; } } - + const metadata = readMetadata(); if (!metadata.csvFile) { return []; } - + const csvPath = path.join(DATA_DIR, metadata.csvFile); if (!fs.existsSync(csvPath)) { return []; } - + try { const content = fs.readFileSync(csvPath, 'utf-8'); sponsorsCache = parseCsv(content); @@ -375,26 +375,26 @@ export function searchSponsors( options: { limit?: number; minScore?: number } = {} ): VisaSponsorSearchResult[] { const { limit = 50, minScore = 30 } = options; - + const sponsors = loadSponsors(); if (sponsors.length === 0 || !query.trim()) { return []; } - + const normalizedQuery = normalizeCompanyName(query); const results: VisaSponsorSearchResult[] = []; const seen = new Set(); // Dedupe by org name - + for (const sponsor of sponsors) { // Skip if we've already seen this org name if (seen.has(sponsor.organisationName)) continue; seen.add(sponsor.organisationName); - + const normalizedSponsor = normalizeCompanyName(sponsor.organisationName); - + // Calculate similarity const score = calculateSimilarity(normalizedQuery, normalizedSponsor); - + if (score >= minScore) { results.push({ sponsor, @@ -403,20 +403,43 @@ export function searchSponsors( }); } } - + // Sort by score descending results.sort((a, b) => b.score - a.score); - + return results.slice(0, limit); } +/** + * Calculate match summary from search results + */ +export function calculateSponsorMatchSummary( + results: VisaSponsorSearchResult[] +): { sponsorMatchScore: number; sponsorMatchNames: string | null } { + if (results.length === 0) { + return { sponsorMatchScore: 0, sponsorMatchNames: null }; + } + + const topScore = results[0].score; + // Get all 100% matches, or just the top match + const perfectMatches = results.filter(r => r.score === 100); + const matchesToReport = perfectMatches.length >= 2 + ? perfectMatches.slice(0, 2) + : [results[0]]; + + return { + sponsorMatchScore: topScore, + sponsorMatchNames: JSON.stringify(matchesToReport.map(r => r.sponsor.organisationName)), + }; +} + /** * Get status of the visa sponsor service */ export function getStatus(): VisaSponsorStatus { const metadata = readMetadata(); const sponsors = loadSponsors(); - + return { lastUpdated: metadata.lastUpdated, csvPath: metadata.csvFile ? path.join(DATA_DIR, metadata.csvFile) : null, @@ -449,12 +472,12 @@ function calculateNextUpdateTime(hour = 2): Date { const now = new Date(); const next = new Date(now); next.setHours(hour, 0, 0, 0); - + // If we've passed the time today, schedule for tomorrow if (next <= now) { next.setDate(next.getDate() + 1); } - + return next; } @@ -472,12 +495,12 @@ function scheduleNextUpdate(hour = 2): void { if (scheduledTimer) { clearTimeout(scheduledTimer); } - + nextScheduledUpdateTime = calculateNextUpdateTime(hour); const delay = nextScheduledUpdateTime.getTime() - Date.now(); - + console.log(`⏰ Next visa sponsor update scheduled for: ${nextScheduledUpdateTime.toISOString()}`); - + scheduledTimer = setTimeout(async () => { console.log('🔄 Running scheduled visa sponsor update...'); await downloadLatestCsv(); @@ -510,7 +533,7 @@ export function stopScheduler(): void { */ export async function initialize(): Promise { const metadata = readMetadata(); - + if (!metadata.csvFile) { console.log('📥 No visa sponsor data found, downloading...'); await downloadLatestCsv(); @@ -518,7 +541,7 @@ export async function initialize(): Promise { const sponsors = loadSponsors(); console.log(`✅ Visa sponsor service initialized with ${sponsors.length} sponsors`); } - + // Start the scheduler for automatic daily updates at 2 AM startScheduler(2); }