dedupe and tests

This commit is contained in:
DaKheera47 2026-01-20 23:27:57 +00:00
parent 6c3bb681d1
commit ad0ca7f183
5 changed files with 204 additions and 81 deletions

View File

@ -160,20 +160,7 @@ jobsRouter.post('/:id/check-sponsor', async (req: Request, res: Response) => {
minScore: 50,
});
let sponsorMatchScore = 0;
let sponsorMatchNames: string | null = null;
if (sponsorResults.length > 0) {
const topScore = sponsorResults[0].score;
// Get all 100% matches, or just the top match
const perfectMatches = sponsorResults.filter(r => r.score === 100);
const matchesToReport = perfectMatches.length >= 2
? perfectMatches.slice(0, 2)
: [sponsorResults[0]];
sponsorMatchScore = topScore;
sponsorMatchNames = JSON.stringify(matchesToReport.map(r => r.sponsor.organisationName));
}
const { sponsorMatchScore, sponsorMatchNames } = visaSponsors.calculateSponsorMatchSummary(sponsorResults);
// Update job with sponsor match info
const updatedJob = await jobsRepo.updateJob(job.id, {

View File

@ -304,17 +304,9 @@ export async function runPipeline(config: Partial<PipelineConfig> = {}): Promise
minScore: 50,
});
if (sponsorResults.length > 0) {
const topScore = sponsorResults[0].score;
// Get all 100% matches, or just the top match
const perfectMatches = sponsorResults.filter(r => r.score === 100);
const matchesToReport = perfectMatches.length >= 2
? perfectMatches.slice(0, 2)
: [sponsorResults[0]];
sponsorMatchScore = topScore;
sponsorMatchNames = JSON.stringify(matchesToReport.map(r => r.sponsor.organisationName));
}
const summary = visaSponsors.calculateSponsorMatchSummary(sponsorResults);
sponsorMatchScore = summary.sponsorMatchScore;
sponsorMatchNames = summary.sponsorMatchNames ?? undefined;
}
// Update score and sponsor match in database

View File

@ -11,6 +11,7 @@ import type { Job } from '../../shared/types.js';
// Mock the visa-sponsors module.
// The factory replaces the whole module with exactly these two exports, each a
// bare vi.fn() stub with no implementation; tests must configure return values
// or implementations themselves before the stubs are exercised.
vi.mock('../services/visa-sponsors/index.js', () => ({
searchSponsors: vi.fn(),
calculateSponsorMatchSummary: vi.fn(),
}));
// Mock the scorer module
@ -115,6 +116,7 @@ const createMockJob = (overrides: Partial<Job> = {}): Job => ({
describe('Sponsor Match Calculation', () => {
let searchSponsors: ReturnType<typeof vi.fn>;
let calculateSponsorMatchSummary: ReturnType<typeof vi.fn>;
let scoreJobSuitability: ReturnType<typeof vi.fn>;
let updateJob: ReturnType<typeof vi.fn>;
let getUnscoredDiscoveredJobs: ReturnType<typeof vi.fn>;
@ -129,6 +131,7 @@ describe('Sponsor Match Calculation', () => {
const jobsRepo = await import('../repositories/jobs.js');
searchSponsors = visaSponsors.searchSponsors as ReturnType<typeof vi.fn>;
calculateSponsorMatchSummary = visaSponsors.calculateSponsorMatchSummary as ReturnType<typeof vi.fn>;
scoreJobSuitability = scorer.scoreJobSuitability as ReturnType<typeof vi.fn>;
updateJob = jobsRepo.updateJob as ReturnType<typeof vi.fn>;
getUnscoredDiscoveredJobs = jobsRepo.getUnscoredDiscoveredJobs as ReturnType<typeof vi.fn>;
@ -138,6 +141,17 @@ describe('Sponsor Match Calculation', () => {
scoreJobSuitability.mockResolvedValue({ score: 75, reason: 'Good match' });
bulkCreateJobs.mockResolvedValue({ created: 0, skipped: 0 });
updateJob.mockResolvedValue(undefined);
calculateSponsorMatchSummary.mockImplementation((results: any[]) => {
if (results.length === 0) return { sponsorMatchScore: 0, sponsorMatchNames: null };
const topScore = results[0].score;
const perfectMatches = results.filter((r: any) => r.score === 100);
const matchesToReport = perfectMatches.length >= 2 ? perfectMatches.slice(0, 2) : [results[0]];
return {
sponsorMatchScore: topScore,
sponsorMatchNames: JSON.stringify(matchesToReport.map((r: any) => r.sponsor.organisationName)),
};
});
});
afterEach(() => {

View File

@ -0,0 +1,107 @@
import { describe, it, expect } from 'vitest';
import { calculateSponsorMatchSummary } from './index.js';
import type { VisaSponsorSearchResult } from '../../../shared/types.js';
describe('calculateSponsorMatchSummary', () => {
  /**
   * Build one search-result fixture. Only `organisationName` is populated on
   * the sponsor, so the partial object is asserted to the sponsor type.
   */
  const makeResult = (
    score: number,
    organisationName: string,
    matchedName: string
  ): VisaSponsorSearchResult => ({
    score,
    sponsor: { organisationName } as VisaSponsorSearchResult['sponsor'],
    matchedName,
  });

  it('should return default values for empty results', () => {
    const noResults: VisaSponsorSearchResult[] = [];

    const { sponsorMatchScore, sponsorMatchNames } = calculateSponsorMatchSummary(noResults);

    expect(sponsorMatchScore).toBe(0);
    expect(sponsorMatchNames).toBeNull();
  });

  it('should report the top match when it is not a perfect match', () => {
    const { sponsorMatchScore, sponsorMatchNames } = calculateSponsorMatchSummary([
      makeResult(85, 'Tech Corp', 'tech corp'),
      makeResult(60, 'Other Ltd', 'other'),
    ]);

    expect(sponsorMatchScore).toBe(85);
    expect(sponsorMatchNames).toBe(JSON.stringify(['Tech Corp']));
  });

  it('should report a single perfect match', () => {
    const { sponsorMatchScore, sponsorMatchNames } = calculateSponsorMatchSummary([
      makeResult(100, 'Exact Match Ltd', 'exact match'),
      makeResult(90, 'Close Match', 'close'),
    ]);

    expect(sponsorMatchScore).toBe(100);
    expect(sponsorMatchNames).toBe(JSON.stringify(['Exact Match Ltd']));
  });

  it('should report exactly two 100% matches when two or more exist', () => {
    // Three perfect matches plus one weak match: only the first two perfect ones are reported.
    const { sponsorMatchScore, sponsorMatchNames } = calculateSponsorMatchSummary([
      makeResult(100, 'First PerfectMatch', 'match'),
      makeResult(100, 'Second PerfectMatch', 'match'),
      makeResult(100, 'Third PerfectMatch', 'match'),
      makeResult(50, 'Common Co', 'common'),
    ]);

    expect(sponsorMatchScore).toBe(100);

    const reportedNames = JSON.parse(sponsorMatchNames!);
    expect(reportedNames).toHaveLength(2);
    expect(reportedNames).toContain('First PerfectMatch');
    expect(reportedNames).toContain('Second PerfectMatch');
    expect(reportedNames).not.toContain('Third PerfectMatch');
  });

  it('should only report the single top result if no 100% matches exist', () => {
    const { sponsorMatchScore, sponsorMatchNames } = calculateSponsorMatchSummary([
      makeResult(99, 'Almost Perfect', 'almost'),
      makeResult(98, 'Second Best', 'best'),
    ]);

    expect(sponsorMatchScore).toBe(99);
    expect(sponsorMatchNames).toBe(JSON.stringify(['Almost Perfect']));
  });
});

View File

@ -57,20 +57,20 @@ let updateError: string | null = null;
*/
export function normalizeCompanyName(name: string): string {
  // Case-fold and trim, then swap punctuation for spaces (not empty strings)
  // so words on either side of a symbol stay separated.
  const depunctuated = name
    .toLowerCase()
    .trim()
    .replace(/[.,'"()[\]{}!?@#$%^&*+=|\\/<>:;`~]/g, ' ');

  // Strip every configured suffix wherever it occurs as a whole word.
  // The suffix is interpolated into the pattern as-is, so any regex
  // metacharacters it contains are treated as regex syntax.
  let stripped = depunctuated;
  for (const suffix of COMPANY_SUFFIXES) {
    stripped = stripped.replace(new RegExp(`\\b${suffix}\\b`, 'gi'), '');
  }

  // The replacements above leave gaps behind; squeeze whitespace runs to one space.
  return stripped.replace(/\s+/g, ' ').trim();
}
@ -81,27 +81,27 @@ export function normalizeCompanyName(name: string): string {
export function calculateSimilarity(str1: string, str2: string): number {
const s1 = str1.toLowerCase();
const s2 = str2.toLowerCase();
if (s1 === s2) return 100;
if (s1.length === 0 || s2.length === 0) return 0;
// Check if one contains the other
if (s1.includes(s2) || s2.includes(s1)) {
const longerLen = Math.max(s1.length, s2.length);
const shorterLen = Math.min(s1.length, s2.length);
return Math.round((shorterLen / longerLen) * 100);
}
// Levenshtein distance
const matrix: number[][] = [];
for (let i = 0; i <= s1.length; i++) {
matrix[i] = [i];
}
for (let j = 0; j <= s2.length; j++) {
matrix[0][j] = j;
}
for (let i = 1; i <= s1.length; i++) {
for (let j = 1; j <= s2.length; j++) {
const cost = s1[i - 1] === s2[j - 1] ? 0 : 1;
@ -112,10 +112,10 @@ export function calculateSimilarity(str1: string, str2: string): number {
);
}
}
const distance = matrix[s1.length][s2.length];
const maxLen = Math.max(s1.length, s2.length);
return Math.round(((maxLen - distance) / maxLen) * 100);
}
@ -125,12 +125,12 @@ export function calculateSimilarity(str1: string, str2: string): number {
export function parseCsv(content: string): VisaSponsor[] {
const lines = content.split('\n');
const sponsors: VisaSponsor[] = [];
// Skip header
for (let i = 1; i < lines.length; i++) {
const line = lines[i].trim();
if (!line) continue;
// Parse CSV with proper quote handling
const fields = parseCSVLine(line);
if (fields.length >= 5) {
@ -143,7 +143,7 @@ export function parseCsv(content: string): VisaSponsor[] {
});
}
}
return sponsors;
}
@ -154,11 +154,11 @@ function parseCSVLine(line: string): string[] {
const fields: string[] = [];
let current = '';
let inQuotes = false;
for (let i = 0; i < line.length; i++) {
const char = line[i];
const nextChar = line[i + 1];
if (char === '"' && !inQuotes) {
inQuotes = true;
} else if (char === '"' && inQuotes) {
@ -176,7 +176,7 @@ function parseCSVLine(line: string): string[] {
current += char;
}
}
fields.push(current.trim());
return fields;
}
@ -186,7 +186,7 @@ function parseCSVLine(line: string): string[] {
*/
function getCsvFiles(): string[] {
if (!fs.existsSync(DATA_DIR)) return [];
return fs.readdirSync(DATA_DIR)
.filter(f => f.endsWith('.csv'))
.sort()
@ -245,25 +245,25 @@ function cleanupOldCsvFiles(): void {
*/
async function extractCsvUrl(): Promise<string> {
  const publicationPage =
    'https://www.gov.uk/government/publications/register-of-licensed-sponsors-workers';
  // Matches the href of the "Worker and Temporary Worker" CSV asset; group 1 is the URL.
  const csvLinkPattern =
    /href="(https:\/\/assets\.publishing\.service\.gov\.uk\/media\/[^"]+Worker_and_Temporary_Worker\.csv)"/;

  console.log('📄 Fetching gov.uk page to find CSV link...');

  const pageResponse = await fetch(publicationPage);
  if (!pageResponse.ok) {
    throw new Error(`Failed to fetch gov.uk page: ${pageResponse.status} ${pageResponse.statusText}`);
  }

  const pageHtml = await pageResponse.text();

  // The pattern has no global flag, so exec() yields the first match, same as String.prototype.match.
  const linkMatch = csvLinkPattern.exec(pageHtml);
  if (linkMatch === null) {
    throw new Error('Could not find Worker and Temporary Worker CSV link on gov.uk page');
  }

  return linkMatch[1];
}
@ -274,52 +274,52 @@ export async function downloadLatestCsv(): Promise<{ success: boolean; message:
if (isUpdating) {
return { success: false, message: 'Update already in progress' };
}
isUpdating = true;
updateError = null;
try {
// Extract the CSV URL from the page
const csvUrl = await extractCsvUrl();
console.log(`📥 Downloading CSV from: ${csvUrl}`);
const response = await fetch(csvUrl);
if (!response.ok) {
throw new Error(`Failed to download CSV: ${response.status} ${response.statusText}`);
}
const csvContent = await response.text();
// Validate CSV has content
const sponsors = parseCsv(csvContent);
if (sponsors.length === 0) {
throw new Error('Downloaded CSV appears to be empty or invalid');
}
// Generate filename with date
const dateStr = new Date().toISOString().split('T')[0];
const filename = `visa_sponsors_${dateStr}.csv`;
const filepath = path.join(DATA_DIR, filename);
// Save the CSV
fs.writeFileSync(filepath, csvContent);
// Update metadata
writeMetadata({
lastUpdated: new Date().toISOString(),
csvFile: filename,
});
// Cleanup old files
cleanupOldCsvFiles();
// Clear cache so next search loads new data
sponsorsCache = null;
cacheLoadedAt = null;
console.log(`✅ Downloaded visa sponsor list: ${sponsors.length} sponsors`);
return {
success: true,
message: `Successfully downloaded ${sponsors.length} sponsors`,
@ -345,17 +345,17 @@ export function loadSponsors(): VisaSponsor[] {
return sponsorsCache;
}
}
const metadata = readMetadata();
if (!metadata.csvFile) {
return [];
}
const csvPath = path.join(DATA_DIR, metadata.csvFile);
if (!fs.existsSync(csvPath)) {
return [];
}
try {
const content = fs.readFileSync(csvPath, 'utf-8');
sponsorsCache = parseCsv(content);
@ -375,26 +375,26 @@ export function searchSponsors(
options: { limit?: number; minScore?: number } = {}
): VisaSponsorSearchResult[] {
const { limit = 50, minScore = 30 } = options;
const sponsors = loadSponsors();
if (sponsors.length === 0 || !query.trim()) {
return [];
}
const normalizedQuery = normalizeCompanyName(query);
const results: VisaSponsorSearchResult[] = [];
const seen = new Set<string>(); // Dedupe by org name
for (const sponsor of sponsors) {
// Skip if we've already seen this org name
if (seen.has(sponsor.organisationName)) continue;
seen.add(sponsor.organisationName);
const normalizedSponsor = normalizeCompanyName(sponsor.organisationName);
// Calculate similarity
const score = calculateSimilarity(normalizedQuery, normalizedSponsor);
if (score >= minScore) {
results.push({
sponsor,
@ -403,20 +403,43 @@ export function searchSponsors(
});
}
}
// Sort by score descending
results.sort((a, b) => b.score - a.score);
return results.slice(0, limit);
}
/**
 * Summarise sponsor search results as a headline score plus the organisation
 * names worth reporting.
 *
 * The first entry of `results` is treated as the best match; no sorting is
 * done here.
 *
 * @param results - Search results, best match first.
 * @returns `sponsorMatchScore` is the first result's score (0 when there are
 * no results). `sponsorMatchNames` is a JSON-encoded array of organisation
 * names: the first two results scoring exactly 100 when at least two exist,
 * otherwise just the first result's name; `null` when there are no results.
 */
export function calculateSponsorMatchSummary(
  results: VisaSponsorSearchResult[]
): { sponsorMatchScore: number; sponsorMatchNames: string | null } {
  if (results.length === 0) {
    return { sponsorMatchScore: 0, sponsorMatchNames: null };
  }

  const best = results[0];
  const sponsorMatchScore = best.score;

  // Several exact (100%) matches are ambiguous, so report two of them;
  // in every other case the single best result is the one to show.
  const exactMatches = results.filter(({ score }) => score === 100);
  const reported = exactMatches.length < 2 ? [best] : exactMatches.slice(0, 2);
  const reportedNames = reported.map(({ sponsor }) => sponsor.organisationName);

  return {
    sponsorMatchScore,
    sponsorMatchNames: JSON.stringify(reportedNames),
  };
}
/**
* Get status of the visa sponsor service
*/
export function getStatus(): VisaSponsorStatus {
const metadata = readMetadata();
const sponsors = loadSponsors();
return {
lastUpdated: metadata.lastUpdated,
csvPath: metadata.csvFile ? path.join(DATA_DIR, metadata.csvFile) : null,
@ -449,12 +472,12 @@ function calculateNextUpdateTime(hour = 2): Date {
const now = new Date();
const next = new Date(now);
next.setHours(hour, 0, 0, 0);
// If we've passed the time today, schedule for tomorrow
if (next <= now) {
next.setDate(next.getDate() + 1);
}
return next;
}
@ -472,12 +495,12 @@ function scheduleNextUpdate(hour = 2): void {
if (scheduledTimer) {
clearTimeout(scheduledTimer);
}
nextScheduledUpdateTime = calculateNextUpdateTime(hour);
const delay = nextScheduledUpdateTime.getTime() - Date.now();
console.log(`⏰ Next visa sponsor update scheduled for: ${nextScheduledUpdateTime.toISOString()}`);
scheduledTimer = setTimeout(async () => {
console.log('🔄 Running scheduled visa sponsor update...');
await downloadLatestCsv();
@ -510,7 +533,7 @@ export function stopScheduler(): void {
*/
export async function initialize(): Promise<void> {
const metadata = readMetadata();
if (!metadata.csvFile) {
console.log('📥 No visa sponsor data found, downloading...');
await downloadLatestCsv();
@ -518,7 +541,7 @@ export async function initialize(): Promise<void> {
const sponsors = loadSponsors();
console.log(`✅ Visa sponsor service initialized with ${sponsors.length} sponsors`);
}
// Start the scheduler for automatic daily updates at 2 AM
startScheduler(2);
}