keywords can be set from UI

This commit is contained in:
DaKheera47 2025-12-26 22:25:55 +00:00
parent bd7baafbec
commit 572cb1d42d
9 changed files with 250 additions and 76 deletions

View File

@ -17,11 +17,30 @@ const locations = [
];
// roles
const roles = [
const defaultRoles = [
"web-development",
"software-systems",
];
/**
 * Turn a free-text search term into a lowercase, hyphen-separated slug:
 * every run of characters outside [a-z0-9] becomes a single "-", and
 * leading/trailing hyphens are stripped. A term with no alphanumeric
 * characters yields the empty string.
 */
const toRoleSlug = (term: string): string =>
  term
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '');

// Roles fall back to the built-in list unless GRADCRACKER_SEARCH_TERMS holds a
// JSON array containing at least one usable term.
let roles = defaultRoles;
const envRolesRaw = process.env.GRADCRACKER_SEARCH_TERMS;
if (envRolesRaw) {
  try {
    // JSON.parse returns arbitrary data, so narrow it instead of asserting string[].
    const parsed: unknown = JSON.parse(envRolesRaw);
    if (Array.isArray(parsed) && parsed.length > 0) {
      const slugs = parsed
        // Skip non-string entries rather than letting one bad element discard the whole override.
        .filter((term): term is string => typeof term === 'string')
        .map(toRoleSlug)
        // A term made only of punctuation/whitespace slugifies to "" and would yield a malformed URL.
        .filter((slug) => slug.length > 0);
      // Different spellings can collapse to the same slug; crawl each role only once.
      const uniqueSlugs = [...new Set(slugs)];
      if (uniqueSlugs.length > 0) {
        roles = uniqueSlugs;
        console.log(`Using configured search terms: ${roles.join(', ')}`);
      } else {
        console.warn('GRADCRACKER_SEARCH_TERMS contained no usable terms; using default roles');
      }
    }
  } catch (e) {
    console.warn('Failed to parse GRADCRACKER_SEARCH_TERMS', e);
  }
}
// combo of locations and roles
const gradcrackerUrls = locations.flatMap((location) => {
return roles.map((role) => {

View File

@ -106,6 +106,7 @@ export async function updateSettings(update: {
jobCompleteWebhookUrl?: string | null
resumeProjects?: ResumeProjectsSettings | null
ukvisajobsMaxJobs?: number | null
searchTerms?: string[] | null
}): Promise<AppSettings> {
return fetchApi<AppSettings>('/settings', {
method: 'PATCH',

View File

@ -43,6 +43,7 @@ export const SettingsPage: React.FC = () => {
const [jobCompleteWebhookUrlDraft, setJobCompleteWebhookUrlDraft] = useState("")
const [resumeProjectsDraft, setResumeProjectsDraft] = useState<ResumeProjectsSettings | null>(null)
const [ukvisajobsMaxJobsDraft, setUkvisajobsMaxJobsDraft] = useState<number | null>(null)
const [searchTermsDraft, setSearchTermsDraft] = useState<string[] | null>(null)
const [isSaving, setIsSaving] = useState(false)
const [isLoading, setIsLoading] = useState(true)
@ -59,6 +60,7 @@ export const SettingsPage: React.FC = () => {
setJobCompleteWebhookUrlDraft(data.overrideJobCompleteWebhookUrl ?? "")
setResumeProjectsDraft(data.resumeProjects)
setUkvisajobsMaxJobsDraft(data.overrideUkvisajobsMaxJobs)
setSearchTermsDraft(data.overrideSearchTerms)
})
.catch((error) => {
const message = error instanceof Error ? error.message : "Failed to load settings"
@ -86,6 +88,9 @@ export const SettingsPage: React.FC = () => {
const effectiveUkvisajobsMaxJobs = settings?.ukvisajobsMaxJobs ?? 50
const defaultUkvisajobsMaxJobs = settings?.defaultUkvisajobsMaxJobs ?? 50
const overrideUkvisajobsMaxJobs = settings?.overrideUkvisajobsMaxJobs
const effectiveSearchTerms = settings?.searchTerms ?? []
const defaultSearchTerms = settings?.defaultSearchTerms ?? []
const overrideSearchTerms = settings?.overrideSearchTerms
const profileProjects = settings?.profileProjects ?? []
const maxProjectsTotal = profileProjects.length
const lockedCount = resumeProjectsDraft?.lockedProjectIds.length ?? 0
@ -99,12 +104,14 @@ export const SettingsPage: React.FC = () => {
const nextJobCompleteWebhook = jobCompleteWebhookUrlDraft.trim()
const currentJobCompleteWebhook = (overrideJobCompleteWebhookUrl ?? "").trim()
const ukvisajobsChanged = ukvisajobsMaxJobsDraft !== (overrideUkvisajobsMaxJobs ?? null)
const searchTermsChanged = JSON.stringify(searchTermsDraft) !== JSON.stringify(overrideSearchTerms ?? null)
return (
next !== current ||
nextWebhook !== currentWebhook ||
nextJobCompleteWebhook !== currentJobCompleteWebhook ||
!resumeProjectsEqual(resumeProjectsDraft, settings.resumeProjects) ||
ukvisajobsChanged
ukvisajobsChanged ||
searchTermsChanged
)
}, [
settings,
@ -117,6 +124,8 @@ export const SettingsPage: React.FC = () => {
resumeProjectsDraft,
ukvisajobsMaxJobsDraft,
overrideUkvisajobsMaxJobs,
searchTermsDraft,
overrideSearchTerms,
])
const handleSave = async () => {
@ -130,12 +139,14 @@ export const SettingsPage: React.FC = () => {
? null
: resumeProjectsDraft
const ukvisajobsMaxJobsOverride = ukvisajobsMaxJobsDraft === defaultUkvisajobsMaxJobs ? null : ukvisajobsMaxJobsDraft
const searchTermsOverride = arraysEqual(searchTermsDraft ?? [], defaultSearchTerms) ? null : searchTermsDraft
const updated = await api.updateSettings({
model: trimmed.length > 0 ? trimmed : null,
pipelineWebhookUrl: webhookTrimmed.length > 0 ? webhookTrimmed : null,
jobCompleteWebhookUrl: jobCompleteTrimmed.length > 0 ? jobCompleteTrimmed : null,
resumeProjects: resumeProjectsOverride,
ukvisajobsMaxJobs: ukvisajobsMaxJobsOverride,
searchTerms: searchTermsOverride,
})
setSettings(updated)
setModelDraft(updated.overrideModel ?? "")
@ -143,6 +154,7 @@ export const SettingsPage: React.FC = () => {
setJobCompleteWebhookUrlDraft(updated.overrideJobCompleteWebhookUrl ?? "")
setResumeProjectsDraft(updated.resumeProjects)
setUkvisajobsMaxJobsDraft(updated.overrideUkvisajobsMaxJobs)
setSearchTermsDraft(updated.overrideSearchTerms)
toast.success("Settings saved")
} catch (error) {
const message = error instanceof Error ? error.message : "Failed to save settings"
@ -161,6 +173,7 @@ export const SettingsPage: React.FC = () => {
jobCompleteWebhookUrl: null,
resumeProjects: null,
ukvisajobsMaxJobs: null,
searchTerms: null,
})
setSettings(updated)
setModelDraft("")
@ -168,6 +181,7 @@ export const SettingsPage: React.FC = () => {
setJobCompleteWebhookUrlDraft("")
setResumeProjectsDraft(updated.resumeProjects)
setUkvisajobsMaxJobsDraft(null)
setSearchTermsDraft(null)
toast.success("Reset to default")
} catch (error) {
const message = error instanceof Error ? error.message : "Failed to reset settings"
@ -330,6 +344,52 @@ export const SettingsPage: React.FC = () => {
</CardContent>
</Card>
<Card>
<CardHeader>
<CardTitle className="text-base">Search Terms</CardTitle>
</CardHeader>
<CardContent className="space-y-4">
<div className="space-y-2">
<div className="text-sm font-medium">Global search terms</div>
<textarea
className="flex min-h-[80px] w-full rounded-md border border-input bg-background px-3 py-2 text-sm ring-offset-background placeholder:text-muted-foreground focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:cursor-not-allowed disabled:opacity-50"
value={searchTermsDraft ? searchTermsDraft.join('\n') : (defaultSearchTerms ?? []).join('\n')}
onChange={(event) => {
const text = event.target.value
const terms = text.split('\n') // Don't filter here to allow empty lines while typing
setSearchTermsDraft(terms)
}}
onBlur={() => {
// Clean up on blur
if (searchTermsDraft) {
setSearchTermsDraft(searchTermsDraft.map(t => t.trim()).filter(Boolean))
}
}}
placeholder="e.g. web developer"
disabled={isLoading || isSaving}
rows={5}
/>
<div className="text-xs text-muted-foreground">
One term per line. Applies to UKVisaJobs and other supported extractors.
</div>
</div>
<Separator />
<div className="grid gap-2 text-sm sm:grid-cols-2">
<div>
<div className="text-xs text-muted-foreground">Effective</div>
<div className="break-words font-mono text-xs">{(effectiveSearchTerms || []).join(', ') || "—"}</div>
</div>
<div>
<div className="text-xs text-muted-foreground">Default (env)</div>
<div className="break-words font-mono text-xs">{(defaultSearchTerms || []).join(', ') || "—"}</div>
</div>
</div>
</CardContent>
</Card>
<Card>
<CardHeader>
<CardTitle className="text-base">Resume Projects</CardTitle>

View File

@ -243,6 +243,13 @@ apiRouter.get('/settings', async (_req: Request, res: Response) => {
const overrideUkvisajobsMaxJobs = overrideUkvisajobsMaxJobsRaw ? parseInt(overrideUkvisajobsMaxJobsRaw, 10) : null;
const ukvisajobsMaxJobs = overrideUkvisajobsMaxJobs ?? defaultUkvisajobsMaxJobs;
// Search terms - stored as JSON array, default from env var (pipe-separated)
const overrideSearchTermsRaw = await settingsRepo.getSetting('searchTerms');
const defaultSearchTermsEnv = process.env.JOBSPY_SEARCH_TERMS || 'web developer';
const defaultSearchTerms = defaultSearchTermsEnv.split('|').map(s => s.trim()).filter(Boolean);
const overrideSearchTerms = overrideSearchTermsRaw ? JSON.parse(overrideSearchTermsRaw) as string[] : null;
const searchTerms = overrideSearchTerms ?? defaultSearchTerms;
res.json({
success: true,
data: {
@ -259,6 +266,9 @@ apiRouter.get('/settings', async (_req: Request, res: Response) => {
ukvisajobsMaxJobs,
defaultUkvisajobsMaxJobs,
overrideUkvisajobsMaxJobs,
searchTerms,
defaultSearchTerms,
overrideSearchTerms,
},
});
} catch (error) {
@ -277,6 +287,7 @@ const updateSettingsSchema = z.object({
aiSelectableProjectIds: z.array(z.string().trim().min(1)).max(200),
}).nullable().optional(),
ukvisajobsMaxJobs: z.number().int().min(1).max(200).nullable().optional(),
searchTerms: z.array(z.string().trim().min(1).max(200)).max(50).nullable().optional(),
});
/**
@ -320,6 +331,11 @@ apiRouter.patch('/settings', async (req: Request, res: Response) => {
await settingsRepo.setSetting('ukvisajobsMaxJobs', ukvisajobsMaxJobs !== null ? String(ukvisajobsMaxJobs) : null);
}
if ('searchTerms' in input) {
const searchTerms = input.searchTerms ?? null;
await settingsRepo.setSetting('searchTerms', searchTerms !== null ? JSON.stringify(searchTerms) : null);
}
const overrideModel = await settingsRepo.getSetting('model');
const defaultModel = process.env.MODEL || 'openai/gpt-4o-mini';
const model = overrideModel || defaultModel;
@ -342,6 +358,13 @@ apiRouter.patch('/settings', async (req: Request, res: Response) => {
const overrideUkvisajobsMaxJobs = overrideUkvisajobsMaxJobsRaw ? parseInt(overrideUkvisajobsMaxJobsRaw, 10) : null;
const ukvisajobsMaxJobs = overrideUkvisajobsMaxJobs ?? defaultUkvisajobsMaxJobs;
// Search terms - stored as JSON array, default from env var (pipe-separated)
const overrideSearchTermsRaw = await settingsRepo.getSetting('searchTerms');
const defaultSearchTermsEnv = process.env.JOBSPY_SEARCH_TERMS || 'web developer';
const defaultSearchTerms = defaultSearchTermsEnv.split('|').map(s => s.trim()).filter(Boolean);
const overrideSearchTerms = overrideSearchTermsRaw ? JSON.parse(overrideSearchTermsRaw) as string[] : null;
const searchTerms = overrideSearchTerms ?? defaultSearchTerms;
res.json({
success: true,
data: {
@ -358,6 +381,9 @@ apiRouter.patch('/settings', async (req: Request, res: Response) => {
ukvisajobsMaxJobs,
defaultUkvisajobsMaxJobs,
overrideUkvisajobsMaxJobs,
searchTerms,
defaultSearchTerms,
overrideSearchTerms,
},
});
} catch (error) {

View File

@ -108,35 +108,22 @@ export async function runPipeline(config: Partial<PipelineConfig> = {}): Promise
// Step 2: Run crawler
console.log('\n🕷 Running crawler...');
progressHelpers.startCrawling();
const existingJobUrls = await jobsRepo.getAllJobUrls();
const discoveredJobs: CreateJobInput[] = [];
const sourceErrors: string[] = [];
if (mergedConfig.sources.includes('gradcracker')) {
const crawlerResult = await runCrawler({
existingJobUrls,
onProgress: (update) => {
progressHelpers.crawlingUpdate({
listPagesProcessed: update.listPagesProcessed,
listPagesTotal: update.listPagesTotal,
jobCardsFound: update.jobCardsFound,
jobPagesEnqueued: update.jobPagesEnqueued,
jobPagesSkipped: update.jobPagesSkipped,
jobPagesProcessed: update.jobPagesProcessed,
phase: update.phase,
currentUrl: update.currentUrl,
});
},
});
// Read search terms setting
const searchTermsSetting = await settingsRepo.getSetting('searchTerms');
let searchTerms: string[] = [];
if (!crawlerResult.success) {
sourceErrors.push(`gradcracker: ${crawlerResult.error ?? 'unknown error'}`);
} else {
discoveredJobs.push(...crawlerResult.jobs);
}
if (searchTermsSetting) {
searchTerms = JSON.parse(searchTermsSetting) as string[];
} else {
// Default from env var
const defaultSearchTermsEnv = process.env.JOBSPY_SEARCH_TERMS || 'web developer';
searchTerms = defaultSearchTermsEnv.split('|').map(s => s.trim()).filter(Boolean);
}
// Run JobSpy (Indeed/LinkedIn) if selected
const jobSpySites = mergedConfig.sources.filter(
(s): s is 'indeed' | 'linkedin' => s === 'indeed' || s === 'linkedin'
);
@ -147,7 +134,10 @@ export async function runPipeline(config: Partial<PipelineConfig> = {}): Promise
detail: `JobSpy: scraping ${jobSpySites.join(', ')}...`,
});
const jobSpyResult = await runJobSpy({ sites: jobSpySites });
const jobSpyResult = await runJobSpy({
sites: jobSpySites,
searchTerms,
});
if (!jobSpyResult.success) {
sourceErrors.push(`jobspy: ${jobSpyResult.error ?? 'unknown error'}`);
} else {
@ -155,6 +145,39 @@ export async function runPipeline(config: Partial<PipelineConfig> = {}): Promise
}
}
// Run Gradcracker crawler if selected
if (mergedConfig.sources.includes('gradcracker')) {
updateProgress({
step: 'crawling',
detail: 'Gradcracker: scraping...',
});
// Pass existing URLs to avoid clicking "Apply" on jobs we already have
const existingJobUrls = await jobsRepo.getAllJobUrls();
const crawlerResult = await runCrawler({
existingJobUrls,
searchTerms,
onProgress: (progress) => {
// Calculate overall progress based on list pages processed vs total
// This is rough but better than nothing
if (progress.listPagesTotal && progress.listPagesTotal > 0) {
const percent = Math.round((progress.listPagesProcessed ?? 0) / progress.listPagesTotal * 100);
updateProgress({
step: 'crawling',
detail: `Gradcracker: ${percent}% (scan ${progress.listPagesProcessed}/${progress.listPagesTotal}, found ${progress.jobCardsFound})`,
});
}
},
});
if (!crawlerResult.success) {
sourceErrors.push(`gradcracker: ${crawlerResult.error ?? 'unknown error'}`);
} else {
discoveredJobs.push(...crawlerResult.jobs);
}
}
// Run UKVisaJobs extractor if selected
if (mergedConfig.sources.includes('ukvisajobs')) {
updateProgress({
@ -166,7 +189,10 @@ export async function runPipeline(config: Partial<PipelineConfig> = {}): Promise
const ukvisajobsMaxJobsSetting = await settingsRepo.getSetting('ukvisajobsMaxJobs');
const ukvisajobsMaxJobs = ukvisajobsMaxJobsSetting ? parseInt(ukvisajobsMaxJobsSetting, 10) : 50;
const ukVisaResult = await runUkVisaJobs({ maxJobs: ukvisajobsMaxJobs });
const ukVisaResult = await runUkVisaJobs({
maxJobs: ukvisajobsMaxJobs,
searchTerms,
});
if (!ukVisaResult.success) {
sourceErrors.push(`ukvisajobs: ${ukVisaResult.error ?? 'unknown error'}`);
} else {

View File

@ -12,6 +12,7 @@ export type SettingKey = 'model'
| 'jobCompleteWebhookUrl'
| 'resumeProjects'
| 'ukvisajobsMaxJobs'
| 'searchTerms'
export async function getSetting(key: SettingKey): Promise<string | null> {
const [row] = await db.select().from(settings).where(eq(settings.key, key))

View File

@ -32,6 +32,11 @@ export interface RunCrawlerOptions {
* Optional callback for live crawl progress emitted by the Gradcracker extractor.
*/
onProgress?: (update: JobExtractorProgress) => void;
/**
* List of search terms to be used as roles for URL generation.
*/
searchTerms?: string[];
}
interface JobExtractorProgress {
@ -61,13 +66,13 @@ async function writeExistingJobUrlsFile(existingJobUrls: string[] | undefined):
*/
export async function runCrawler(options: RunCrawlerOptions = {}): Promise<CrawlerResult> {
console.log('🕷️ Starting job crawler...');
try {
// Clear previous results
await clearStorageDataset();
const existingJobUrlsFile = await writeExistingJobUrlsFile(options.existingJobUrls);
// Run the crawler
await new Promise<void>((resolve, reject) => {
const child = spawn('npm', ['run', 'start'], {
@ -78,6 +83,7 @@ export async function runCrawler(options: RunCrawlerOptions = {}): Promise<Crawl
...process.env,
JOBOPS_SKIP_APPLY_FOR_EXISTING: '1',
JOBOPS_EMIT_PROGRESS: '1',
GRADCRACKER_SEARCH_TERMS: options.searchTerms ? JSON.stringify(options.searchTerms) : '',
...(existingJobUrlsFile ? { JOBOPS_EXISTING_JOB_URLS_FILE: existingJobUrlsFile } : {}),
},
});
@ -101,7 +107,7 @@ export async function runCrawler(options: RunCrawlerOptions = {}): Promise<Crawl
stdoutRl?.on('line', (line) => handleLine(line, process.stdout));
stderrRl?.on('line', (line) => handleLine(line, process.stderr));
child.on('close', (code) => {
stdoutRl?.close();
stderrRl?.close();
@ -111,15 +117,15 @@ export async function runCrawler(options: RunCrawlerOptions = {}): Promise<Crawl
reject(new Error(`Crawler exited with code ${code}`));
}
});
child.on('error', reject);
});
// Read crawled jobs from storage
const jobs = await readCrawledJobs();
console.log(`✅ Crawler completed. Found ${jobs.length} jobs.`);
return { success: true, jobs };
} catch (error) {
const message = error instanceof Error ? error.message : 'Unknown error';
@ -135,13 +141,13 @@ async function readCrawledJobs(): Promise<CreateJobInput[]> {
try {
const files = await readdir(STORAGE_DIR);
const jsonFiles = files.filter(f => f.endsWith('.json'));
const jobs: CreateJobInput[] = [];
for (const file of jsonFiles) {
const content = await readFile(join(STORAGE_DIR, file), 'utf-8');
const data = JSON.parse(content);
// Map crawler output to our job input format
jobs.push({
source: 'gradcracker',
@ -159,7 +165,7 @@ async function readCrawledJobs(): Promise<CreateJobInput[]> {
jobDescription: data.jobDescription,
});
}
return jobs;
} catch (error) {
console.error('Failed to read crawled jobs:', error);

View File

@ -15,10 +15,12 @@ const UKVISAJOBS_DIR = join(__dirname, '../../../../extractors/ukvisajobs');
const STORAGE_DIR = join(UKVISAJOBS_DIR, 'storage/datasets/default');
/** Options accepted by runUkVisaJobs. */
export interface RunUkVisaJobsOptions {
/** Maximum number of jobs to fetch per search term. Defaults to 50, max 200. */
maxJobs?: number;
/** Single search keyword (legacy support); used only when `searchTerms` is absent or empty. */
searchKeyword?: string;
/** List of search terms to run sequentially; takes precedence over `searchKeyword`. */
searchTerms?: string[];
}
export interface UkVisaJobsResult {
@ -38,46 +40,76 @@ async function clearStorageDataset(): Promise<void> {
}
}
/**
 * Run the UK Visa Jobs extractor once per search term and merge the results.
 *
 * Terms are taken from `options.searchTerms` (trimmed, blanks and duplicates
 * dropped). If none remain, the single `options.searchKeyword` is used, and if
 * that is unset too the extractor runs once with an empty keyword. Each run
 * clears the storage dataset, spawns the extractor with the term in
 * `UKVISAJOBS_SEARCH_KEYWORD`, then reads the dataset back. Jobs are
 * de-duplicated across runs by `sourceJobId`, falling back to `jobUrl`.
 *
 * A failing term is logged and skipped so the remaining terms still run.
 *
 * @param options - Per-term job limit and the search terms/keyword to run.
 * @returns `success: true` with the merged unique jobs when at least one run
 *   completed; `success: false` with the collected error messages when every
 *   run failed.
 */
export async function runUkVisaJobs(options: RunUkVisaJobsOptions = {}): Promise<UkVisaJobsResult> {
  console.log('🇬🇧 Running UK Visa Jobs extractor...');

  // Determine terms to run. Blank entries would run the extractor without a
  // keyword filter and duplicates would repeat an identical run, so drop both.
  const configuredTerms = [
    ...new Set((options.searchTerms ?? []).map((term) => term.trim()).filter(Boolean)),
  ];
  // No search terms = legacy single keyword, or run once without keyword.
  const terms: string[] =
    configuredTerms.length > 0 ? configuredTerms : [options.searchKeyword ?? ''];

  const allJobs: CreateJobInput[] = [];
  const seenIds = new Set<string>();
  const failures: string[] = [];

  for (const term of terms) {
    const termLabel = term ? `"${term}"` : 'all jobs';
    console.log(` Running for ${termLabel}...`);

    try {
      // Clear previous results for this run
      await clearStorageDataset();
      await mkdir(STORAGE_DIR, { recursive: true });

      // Run the extractor
      await new Promise<void>((resolve, reject) => {
        const child = spawn('npx', ['tsx', 'src/main.ts'], {
          cwd: UKVISAJOBS_DIR,
          stdio: 'inherit',
          env: {
            ...process.env,
            UKVISAJOBS_MAX_JOBS: String(options.maxJobs ?? 50),
            UKVISAJOBS_SEARCH_KEYWORD: term,
          },
        });

        child.on('close', (code) => {
          if (code === 0) resolve();
          else reject(new Error(`UK Visa Jobs extractor exited with code ${code}`));
        });
        child.on('error', reject);
      });

      // Read the output dataset and accumulate
      const runJobs = await readDataset();
      let newCount = 0;
      for (const job of runJobs) {
        // Deduplicate by sourceJobId or jobUrl
        const id = job.sourceJobId || job.jobUrl;
        if (!seenIds.has(id)) {
          seenIds.add(id);
          allJobs.push(job);
          newCount++;
        }
      }

      console.log(` ✅ Fetched ${runJobs.length} jobs for ${termLabel} (${newCount} new unique)`);
    } catch (error) {
      const message = error instanceof Error ? error.message : 'Unknown error';
      console.error(`❌ UK Visa Jobs failed for ${termLabel}: ${message}`);
      // Continue to next term instead of failing completely, but remember the
      // failure so a total outage is not reported as success.
      failures.push(`${termLabel}: ${message}`);
    }
  }

  // Every run failed: report it so the caller can record a source error
  // instead of treating the outage as "zero jobs found".
  if (failures.length === terms.length) {
    return { success: false, jobs: [], error: failures.join('; ') };
  }

  console.log(`✅ UK Visa Jobs: imported total ${allJobs.length} unique jobs`);
  return { success: true, jobs: allJobs };
}
/**

View File

@ -204,4 +204,7 @@ export interface AppSettings {
ukvisajobsMaxJobs: number;
defaultUkvisajobsMaxJobs: number;
overrideUkvisajobsMaxJobs: number | null;
searchTerms: string[];
defaultSearchTerms: string[];
overrideSearchTerms: string[] | null;
}