From 2cf9249159390391c28677a6a7724bb879e59f29 Mon Sep 17 00:00:00 2001 From: DaKheera47 Date: Thu, 15 Jan 2026 19:17:23 +0000 Subject: [PATCH] gradcracker limits --- extractors/gradcracker/src/main.ts | 9 ++- extractors/gradcracker/src/routes.ts | 31 ++++++++- orchestrator/src/client/api/client.ts | 1 + .../src/client/pages/SettingsPage.tsx | 67 +++++++++++++++++-- orchestrator/src/server/api/routes.ts | 22 ++++++ .../src/server/pipeline/orchestrator.ts | 4 ++ .../src/server/repositories/settings.ts | 1 + orchestrator/src/server/services/crawler.ts | 6 ++ orchestrator/src/shared/types.ts | 3 + 9 files changed, 136 insertions(+), 8 deletions(-) diff --git a/extractors/gradcracker/src/main.ts b/extractors/gradcracker/src/main.ts index 09f910a..989ae55 100644 --- a/extractors/gradcracker/src/main.ts +++ b/extractors/gradcracker/src/main.ts @@ -44,15 +44,18 @@ if (envRolesRaw) { // combo of locations and roles const gradcrackerUrls = locations.flatMap((location) => { return roles.map((role) => { - return `https://www.gradcracker.com/search/computing-technology/${role}-graduate-jobs-in-${location}?order=dateAdded`; + return { + url: `https://www.gradcracker.com/search/computing-technology/${role}-graduate-jobs-in-${location}?order=dateAdded`, + role + }; }); }); console.log(`Total gradcracker URLs: ${gradcrackerUrls.length}`) -const startUrls = gradcrackerUrls.map((url) => ({ +const startUrls = gradcrackerUrls.map(({ url, role }) => ({ url, - userData: { label: "gradcracker-list-page" }, + userData: { label: "gradcracker-list-page", role }, })); initJobOpsProgress(startUrls.length); diff --git a/extractors/gradcracker/src/routes.ts b/extractors/gradcracker/src/routes.ts index be9ba86..b6a440a 100644 --- a/extractors/gradcracker/src/routes.ts +++ b/extractors/gradcracker/src/routes.ts @@ -44,6 +44,10 @@ function getExistingJobUrlSet(): Set { const SKIP_APPLY_FOR_EXISTING = process.env.JOBOPS_SKIP_APPLY_FOR_EXISTING === "1"; const EXISTING_JOB_URLS = 
getExistingJobUrlSet(); +// Global counters for max jobs per search term +const jobCounts = new Map(); +const MAX_JOBS_PER_TERM = parseInt(process.env.GRADCRACKER_MAX_JOBS_PER_TERM || "0", 10); + interface Job { title: string | null; jobUrl: string | null; @@ -62,7 +66,22 @@ export const router = createPlaywrightRouter(); router.addHandler( "gradcracker-list-page", async ({ page, request, enqueueLinks }) => { - log.info(`Processing: ${request.url}`); + const { role } = request.userData; + log.info(`Processing: ${request.url} (Role: ${role})`); + + if (MAX_JOBS_PER_TERM > 0) { + const currentCount = jobCounts.get(role) || 0; + if (currentCount >= MAX_JOBS_PER_TERM) { + log.info(`Max jobs (${MAX_JOBS_PER_TERM}) already enqueued for role "${role}". Skipping list page.`); + markListPageDone({ + currentUrl: request.url, + jobCardsFound: 0, + jobPagesEnqueued: 0, + jobPagesSkipped: 0, + }); + return; + } + } // Wait until the job cards are rendered await page.waitForSelector("article[wire\\:key]", { timeout: 10000 }); @@ -172,6 +191,16 @@ router.addHandler( if (isKnownJob) { skippedKnownJobs++; } else { + // Check if we reached the limit for this search term + if (MAX_JOBS_PER_TERM > 0) { + const currentCount = jobCounts.get(role) || 0; + if (currentCount >= MAX_JOBS_PER_TERM) { + log.info(`Reached max jobs limit (${MAX_JOBS_PER_TERM}) for role "${role}" while processing list. 
Stopping.`); + break; + } + jobCounts.set(role, currentCount + 1); + } + await enqueueLinks({ urls: [jobUrl], userData: { diff --git a/orchestrator/src/client/api/client.ts b/orchestrator/src/client/api/client.ts index aa7264e..6fe30d4 100644 --- a/orchestrator/src/client/api/client.ts +++ b/orchestrator/src/client/api/client.ts @@ -154,6 +154,7 @@ export async function updateSettings(update: { jobCompleteWebhookUrl?: string | null resumeProjects?: ResumeProjectsSettings | null ukvisajobsMaxJobs?: number | null + gradcrackerMaxJobsPerTerm?: number | null searchTerms?: string[] | null jobspyLocation?: string | null jobspyResultsWanted?: number | null diff --git a/orchestrator/src/client/pages/SettingsPage.tsx b/orchestrator/src/client/pages/SettingsPage.tsx index 51ad272..a01d1f7 100644 --- a/orchestrator/src/client/pages/SettingsPage.tsx +++ b/orchestrator/src/client/pages/SettingsPage.tsx @@ -78,6 +78,7 @@ export const SettingsPage: React.FC = () => { const [jobCompleteWebhookUrlDraft, setJobCompleteWebhookUrlDraft] = useState("") const [resumeProjectsDraft, setResumeProjectsDraft] = useState(null) const [ukvisajobsMaxJobsDraft, setUkvisajobsMaxJobsDraft] = useState(null) + const [gradcrackerMaxJobsPerTermDraft, setGradcrackerMaxJobsPerTermDraft] = useState(null) const [searchTermsDraft, setSearchTermsDraft] = useState(null) const [jobspyLocationDraft, setJobspyLocationDraft] = useState(null) const [jobspyResultsWantedDraft, setJobspyResultsWantedDraft] = useState(null) @@ -105,6 +106,7 @@ export const SettingsPage: React.FC = () => { setJobCompleteWebhookUrlDraft(data.overrideJobCompleteWebhookUrl ?? 
"") setResumeProjectsDraft(data.resumeProjects) setUkvisajobsMaxJobsDraft(data.overrideUkvisajobsMaxJobs) + setGradcrackerMaxJobsPerTermDraft(data.overrideGradcrackerMaxJobsPerTerm) setSearchTermsDraft(data.overrideSearchTerms) setJobspyLocationDraft(data.overrideJobspyLocation) setJobspyResultsWantedDraft(data.overrideJobspyResultsWanted) @@ -145,6 +147,9 @@ export const SettingsPage: React.FC = () => { const effectiveUkvisajobsMaxJobs = settings?.ukvisajobsMaxJobs ?? 50 const defaultUkvisajobsMaxJobs = settings?.defaultUkvisajobsMaxJobs ?? 50 const overrideUkvisajobsMaxJobs = settings?.overrideUkvisajobsMaxJobs + const effectiveGradcrackerMaxJobsPerTerm = settings?.gradcrackerMaxJobsPerTerm ?? 50 + const defaultGradcrackerMaxJobsPerTerm = settings?.defaultGradcrackerMaxJobsPerTerm ?? 50 + const overrideGradcrackerMaxJobsPerTerm = settings?.overrideGradcrackerMaxJobsPerTerm const effectiveSearchTerms = settings?.searchTerms ?? [] const defaultSearchTerms = settings?.defaultSearchTerms ?? [] const overrideSearchTerms = settings?.overrideSearchTerms @@ -185,6 +190,7 @@ export const SettingsPage: React.FC = () => { const nextJobCompleteWebhook = jobCompleteWebhookUrlDraft.trim() const currentJobCompleteWebhook = (overrideJobCompleteWebhookUrl ?? "").trim() const ukvisajobsChanged = ukvisajobsMaxJobsDraft !== (overrideUkvisajobsMaxJobs ?? null) + const gradcrackerChanged = gradcrackerMaxJobsPerTermDraft !== (overrideGradcrackerMaxJobsPerTerm ?? null) const searchTermsChanged = JSON.stringify(searchTermsDraft) !== JSON.stringify(overrideSearchTerms ?? null) return ( next !== current || @@ -195,6 +201,7 @@ export const SettingsPage: React.FC = () => { nextJobCompleteWebhook !== currentJobCompleteWebhook || !resumeProjectsEqual(resumeProjectsDraft, settings.resumeProjects) || ukvisajobsChanged || + gradcrackerChanged || searchTermsChanged || jobspyLocationDraft !== (overrideJobspyLocation ?? null) || jobspyResultsWantedDraft !== (overrideJobspyResultsWanted ?? 
null) || @@ -220,6 +227,8 @@ export const SettingsPage: React.FC = () => { resumeProjectsDraft, ukvisajobsMaxJobsDraft, overrideUkvisajobsMaxJobs, + gradcrackerMaxJobsPerTermDraft, + overrideGradcrackerMaxJobsPerTerm, searchTermsDraft, overrideSearchTerms, jobspyLocationDraft, @@ -250,6 +259,7 @@ export const SettingsPage: React.FC = () => { ? null : resumeProjectsDraft const ukvisajobsMaxJobsOverride = ukvisajobsMaxJobsDraft === defaultUkvisajobsMaxJobs ? null : ukvisajobsMaxJobsDraft + const gradcrackerMaxJobsPerTermOverride = gradcrackerMaxJobsPerTermDraft === defaultGradcrackerMaxJobsPerTerm ? null : gradcrackerMaxJobsPerTermDraft const searchTermsOverride = arraysEqual(searchTermsDraft ?? [], defaultSearchTerms) ? null : searchTermsDraft const jobspyLocationOverride = jobspyLocationDraft === defaultJobspyLocation ? null : jobspyLocationDraft const jobspyResultsWantedOverride = jobspyResultsWantedDraft === defaultJobspyResultsWanted ? null : jobspyResultsWantedDraft @@ -266,6 +276,7 @@ export const SettingsPage: React.FC = () => { jobCompleteWebhookUrl: jobCompleteTrimmed.length > 0 ? jobCompleteTrimmed : null, resumeProjects: resumeProjectsOverride, ukvisajobsMaxJobs: ukvisajobsMaxJobsOverride, + gradcrackerMaxJobsPerTerm: gradcrackerMaxJobsPerTermOverride, searchTerms: searchTermsOverride, jobspyLocation: jobspyLocationOverride, jobspyResultsWanted: jobspyResultsWantedOverride, @@ -283,6 +294,7 @@ export const SettingsPage: React.FC = () => { setJobCompleteWebhookUrlDraft(updated.overrideJobCompleteWebhookUrl ?? 
"") setResumeProjectsDraft(updated.resumeProjects) setUkvisajobsMaxJobsDraft(updated.overrideUkvisajobsMaxJobs) + setGradcrackerMaxJobsPerTermDraft(updated.overrideGradcrackerMaxJobsPerTerm) setSearchTermsDraft(updated.overrideSearchTerms) setJobspyLocationDraft(updated.overrideJobspyLocation) setJobspyResultsWantedDraft(updated.overrideJobspyResultsWanted) @@ -365,6 +377,7 @@ export const SettingsPage: React.FC = () => { jobCompleteWebhookUrl: null, resumeProjects: null, ukvisajobsMaxJobs: null, + gradcrackerMaxJobsPerTerm: null, searchTerms: null, jobspyLocation: null, jobspyResultsWanted: null, @@ -382,6 +395,7 @@ export const SettingsPage: React.FC = () => { setJobCompleteWebhookUrlDraft("") setResumeProjectsDraft(updated.resumeProjects) setUkvisajobsMaxJobsDraft(null) + setGradcrackerMaxJobsPerTermDraft(null) setSearchTermsDraft(null) setJobspyLocationDraft(null) setJobspyResultsWantedDraft(null) @@ -573,20 +587,20 @@ export const SettingsPage: React.FC = () => { type="number" inputMode="numeric" min={1} - max={200} + max={1000} value={ukvisajobsMaxJobsDraft ?? defaultUkvisajobsMaxJobs} onChange={(event) => { const value = parseInt(event.target.value, 10) if (Number.isNaN(value)) { setUkvisajobsMaxJobsDraft(null) } else { - setUkvisajobsMaxJobsDraft(Math.min(200, Math.max(1, value))) + setUkvisajobsMaxJobsDraft(Math.min(1000, Math.max(1, value))) } }} disabled={isLoading || isSaving} />
- Maximum number of jobs to fetch from UKVisaJobs per pipeline run. Range: 1-200. + Maximum number of jobs to fetch from UKVisaJobs per pipeline run. Range: 1-1000.
@@ -599,7 +613,52 @@ export const SettingsPage: React.FC = () => {
Default
-
{defaultUkvisajobsMaxJobs}
+
{defaultUkvisajobsMaxJobs}
+
+ + + + + + + + Gradcracker Extractor + + +
+
+
Max jobs per search term
+ { + const value = parseInt(event.target.value, 10) + if (Number.isNaN(value)) { + setGradcrackerMaxJobsPerTermDraft(null) + } else { + setGradcrackerMaxJobsPerTermDraft(Math.min(1000, Math.max(1, value))) + } + }} + disabled={isLoading || isSaving} + /> +
+ Maximum number of jobs to fetch for EACH search term from Gradcracker. Range: 1-1000. +
+
+ + + +
+
+
Effective
+
{effectiveGradcrackerMaxJobsPerTerm}
+
+
+
Default
+
{defaultGradcrackerMaxJobsPerTerm}
diff --git a/orchestrator/src/server/api/routes.ts b/orchestrator/src/server/api/routes.ts index 5a6844c..3d8ce5e 100644 --- a/orchestrator/src/server/api/routes.ts +++ b/orchestrator/src/server/api/routes.ts @@ -299,6 +299,11 @@ apiRouter.get('/settings', async (_req: Request, res: Response) => { const overrideUkvisajobsMaxJobs = overrideUkvisajobsMaxJobsRaw ? parseInt(overrideUkvisajobsMaxJobsRaw, 10) : null; const ukvisajobsMaxJobs = overrideUkvisajobsMaxJobs ?? defaultUkvisajobsMaxJobs; + const overrideGradcrackerMaxJobsPerTermRaw = await settingsRepo.getSetting('gradcrackerMaxJobsPerTerm'); + const defaultGradcrackerMaxJobsPerTerm = 50; + const overrideGradcrackerMaxJobsPerTerm = overrideGradcrackerMaxJobsPerTermRaw ? parseInt(overrideGradcrackerMaxJobsPerTermRaw, 10) : null; + const gradcrackerMaxJobsPerTerm = overrideGradcrackerMaxJobsPerTerm ?? defaultGradcrackerMaxJobsPerTerm; + const overrideSearchTermsRaw = await settingsRepo.getSetting('searchTerms'); const defaultSearchTermsEnv = process.env.JOBSPY_SEARCH_TERMS || 'web developer'; const defaultSearchTerms = defaultSearchTermsEnv.split('|').map(s => s.trim()).filter(Boolean); @@ -358,6 +363,9 @@ apiRouter.get('/settings', async (_req: Request, res: Response) => { ukvisajobsMaxJobs, defaultUkvisajobsMaxJobs, overrideUkvisajobsMaxJobs, + gradcrackerMaxJobsPerTerm, + defaultGradcrackerMaxJobsPerTerm, + overrideGradcrackerMaxJobsPerTerm, searchTerms, defaultSearchTerms, overrideSearchTerms, @@ -400,6 +408,7 @@ const updateSettingsSchema = z.object({ aiSelectableProjectIds: z.array(z.string().trim().min(1)).max(200), }).nullable().optional(), - ukvisajobsMaxJobs: z.number().int().min(1).max(200).nullable().optional(), + ukvisajobsMaxJobs: z.number().int().min(1).max(1000).nullable().optional(), + gradcrackerMaxJobsPerTerm: z.number().int().min(1).max(1000).nullable().optional(), searchTerms: z.array(z.string().trim().min(1).max(200)).max(50).nullable().optional(), jobspyLocation: z.string().trim().min(1).max(100).nullable().optional(), jobspyResultsWanted: 
z.number().int().min(1).max(500).nullable().optional(), @@ -460,6 +469,11 @@ apiRouter.patch('/settings', async (req: Request, res: Response) => { await settingsRepo.setSetting('ukvisajobsMaxJobs', ukvisajobsMaxJobs !== null ? String(ukvisajobsMaxJobs) : null); } + if ('gradcrackerMaxJobsPerTerm' in input) { + const gradcrackerMaxJobsPerTerm = input.gradcrackerMaxJobsPerTerm ?? null; + await settingsRepo.setSetting('gradcrackerMaxJobsPerTerm', gradcrackerMaxJobsPerTerm !== null ? String(gradcrackerMaxJobsPerTerm) : null); + } + if ('searchTerms' in input) { const searchTerms = input.searchTerms ?? null; await settingsRepo.setSetting('searchTerms', searchTerms !== null ? JSON.stringify(searchTerms) : null); @@ -526,6 +540,11 @@ apiRouter.patch('/settings', async (req: Request, res: Response) => { const overrideUkvisajobsMaxJobs = overrideUkvisajobsMaxJobsRaw ? parseInt(overrideUkvisajobsMaxJobsRaw, 10) : null; const ukvisajobsMaxJobs = overrideUkvisajobsMaxJobs ?? defaultUkvisajobsMaxJobs; + const overrideGradcrackerMaxJobsPerTermRaw = await settingsRepo.getSetting('gradcrackerMaxJobsPerTerm'); + const defaultGradcrackerMaxJobsPerTerm = 50; + const overrideGradcrackerMaxJobsPerTerm = overrideGradcrackerMaxJobsPerTermRaw ? parseInt(overrideGradcrackerMaxJobsPerTermRaw, 10) : null; + const gradcrackerMaxJobsPerTerm = overrideGradcrackerMaxJobsPerTerm ?? 
defaultGradcrackerMaxJobsPerTerm; + // Search terms - stored as JSON array, default from env var (pipe-separated) const overrideSearchTermsRaw = await settingsRepo.getSetting('searchTerms'); const defaultSearchTermsEnv = process.env.JOBSPY_SEARCH_TERMS || 'web developer'; @@ -586,6 +605,9 @@ apiRouter.patch('/settings', async (req: Request, res: Response) => { ukvisajobsMaxJobs, defaultUkvisajobsMaxJobs, overrideUkvisajobsMaxJobs, + gradcrackerMaxJobsPerTerm, + defaultGradcrackerMaxJobsPerTerm, + overrideGradcrackerMaxJobsPerTerm, searchTerms, defaultSearchTerms, overrideSearchTerms, diff --git a/orchestrator/src/server/pipeline/orchestrator.ts b/orchestrator/src/server/pipeline/orchestrator.ts index aaf6b38..c78ceb5 100644 --- a/orchestrator/src/server/pipeline/orchestrator.ts +++ b/orchestrator/src/server/pipeline/orchestrator.ts @@ -182,9 +182,13 @@ export async function runPipeline(config: Partial = {}): Promise // Pass existing URLs to avoid clicking "Apply" on jobs we already have const existingJobUrls = await jobsRepo.getAllJobUrls(); + const gradcrackerMaxJobsSetting = await settingsRepo.getSetting('gradcrackerMaxJobsPerTerm'); + const gradcrackerMaxJobs = gradcrackerMaxJobsSetting ? 
parseInt(gradcrackerMaxJobsSetting, 10) : 50; + const crawlerResult = await runCrawler({ existingJobUrls, searchTerms, + maxJobsPerTerm: gradcrackerMaxJobs, onProgress: (progress) => { // Calculate overall progress based on list pages processed vs total // This is rough but better than nothing diff --git a/orchestrator/src/server/repositories/settings.ts b/orchestrator/src/server/repositories/settings.ts index 4bede73..088f860 100644 --- a/orchestrator/src/server/repositories/settings.ts +++ b/orchestrator/src/server/repositories/settings.ts @@ -15,6 +15,7 @@ export type SettingKey = 'model' | 'jobCompleteWebhookUrl' | 'resumeProjects' | 'ukvisajobsMaxJobs' + | 'gradcrackerMaxJobsPerTerm' | 'searchTerms' | 'jobspyLocation' | 'jobspyResultsWanted' diff --git a/orchestrator/src/server/services/crawler.ts b/orchestrator/src/server/services/crawler.ts index 11c692a..1d72e6a 100644 --- a/orchestrator/src/server/services/crawler.ts +++ b/orchestrator/src/server/services/crawler.ts @@ -37,6 +37,11 @@ export interface RunCrawlerOptions { * List of search terms to be used as roles for URL generation. */ searchTerms?: string[]; + + /** + * Max jobs to fetch per search term. + */ + maxJobsPerTerm?: number; } interface JobExtractorProgress { @@ -84,6 +89,7 @@ export async function runCrawler(options: RunCrawlerOptions = {}): Promise