Add startup.jobs extractor support (#279)
* Add startup.jobs extractor support * Harden startup.jobs extractor inputs * Wire startupjobs into Docker and CI * Tighten startupjobs review follow-ups * fix: publish ghcr during release workflow * feat: add startupjobs max jobs configuration and update related tests
This commit is contained in:
parent
26275e4ee8
commit
71e640b563
1
.github/workflows/ci.yml
vendored
1
.github/workflows/ci.yml
vendored
@ -54,6 +54,7 @@ jobs:
|
||||
- adzuna-extractor
|
||||
- hiringcafe-extractor
|
||||
- gradcracker-extractor
|
||||
- startupjobs-extractor
|
||||
- ukvisajobs-extractor
|
||||
steps:
|
||||
- uses: actions/checkout@v4
|
||||
|
||||
49
.github/workflows/release.yml
vendored
49
.github/workflows/release.yml
vendored
@ -7,9 +7,14 @@ on:
|
||||
description: "Next release version (x.y.z)"
|
||||
required: true
|
||||
type: string
|
||||
release_title:
|
||||
description: "Optional release title shown on GitHub (defaults to vX.Y.Z)"
|
||||
required: false
|
||||
type: string
|
||||
|
||||
permissions:
|
||||
contents: write
|
||||
packages: write
|
||||
|
||||
concurrency:
|
||||
group: release-${{ inputs.version }}
|
||||
@ -83,8 +88,50 @@ jobs:
|
||||
git tag "v$RELEASE_VERSION"
|
||||
git push origin "v$RELEASE_VERSION"
|
||||
|
||||
- name: Set up QEMU
|
||||
uses: docker/setup-qemu-action@v3
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@v3
|
||||
|
||||
- name: Log in to GHCR
|
||||
uses: docker/login-action@v3
|
||||
with:
|
||||
registry: ghcr.io
|
||||
username: ${{ github.actor }}
|
||||
password: ${{ secrets.GITHUB_TOKEN }}
|
||||
|
||||
- name: Docker meta (tags/labels)
|
||||
id: docker-meta
|
||||
uses: docker/metadata-action@v5
|
||||
with:
|
||||
images: ghcr.io/${{ github.repository_owner }}/job-ops
|
||||
tags: |
|
||||
type=raw,value=v${{ inputs.version }}
|
||||
type=raw,value=latest
|
||||
type=sha
|
||||
|
||||
- name: Build and push GHCR image
|
||||
uses: docker/build-push-action@v6
|
||||
with:
|
||||
context: .
|
||||
file: ./Dockerfile
|
||||
push: true
|
||||
platforms: linux/amd64,linux/arm64
|
||||
tags: ${{ steps.docker-meta.outputs.tags }}
|
||||
labels: ${{ steps.docker-meta.outputs.labels }}
|
||||
cache-from: type=gha
|
||||
cache-to: type=gha,mode=max
|
||||
|
||||
- name: Create GitHub release
|
||||
env:
|
||||
GH_TOKEN: ${{ github.token }}
|
||||
RELEASE_VERSION: ${{ inputs.version }}
|
||||
run: gh release create "v$RELEASE_VERSION" --title "v$RELEASE_VERSION" --generate-notes
|
||||
INPUT_RELEASE_TITLE: ${{ inputs.release_title }}
|
||||
run: |
|
||||
RELEASE_TITLE="$(printf '%s' "$INPUT_RELEASE_TITLE" | sed 's/^[[:space:]]*//;s/[[:space:]]*$//')"
|
||||
if [ -z "$RELEASE_TITLE" ]; then
|
||||
RELEASE_TITLE="v$RELEASE_VERSION"
|
||||
fi
|
||||
|
||||
gh release create "v$RELEASE_VERSION" --title "$RELEASE_TITLE" --generate-notes
|
||||
|
||||
@ -60,7 +60,8 @@ Releases are driven from GitHub Actions.
|
||||
|
||||
1. Open the `release` workflow in GitHub Actions.
|
||||
2. Enter the next version as `x.y.z` (for example `0.1.30`).
|
||||
3. Run the workflow.
|
||||
3. Optionally enter a separate release title for GitHub (for example `Google Dorks!`).
|
||||
4. Run the workflow.
|
||||
|
||||
The workflow will:
|
||||
|
||||
@ -68,9 +69,10 @@ The workflow will:
|
||||
- update `package-lock.json`
|
||||
- commit the version bump to `main`
|
||||
- create and push tag `vX.Y.Z`
|
||||
- create the GitHub release
|
||||
- publish the `ghcr.io/.../job-ops` image for that release
|
||||
- create the GitHub release using either the custom title or `vX.Y.Z`
|
||||
|
||||
The app version shown in the UI is sourced from `orchestrator/package.json`, so the release version, tag, and displayed app version stay aligned.
|
||||
The app version shown in the UI is sourced from `orchestrator/package.json`, so the release version, tag, and displayed app version stay aligned even when the GitHub release title is customized separately.
|
||||
|
||||
## Validation Before PR (CI-Parity Checks)
|
||||
|
||||
|
||||
@ -38,6 +38,7 @@ COPY orchestrator/package*.json ./orchestrator/
|
||||
COPY extractors/adzuna/package*.json ./extractors/adzuna/
|
||||
COPY extractors/hiringcafe/package*.json ./extractors/hiringcafe/
|
||||
COPY extractors/gradcracker/package*.json ./extractors/gradcracker/
|
||||
COPY extractors/startupjobs/package*.json ./extractors/startupjobs/
|
||||
COPY extractors/ukvisajobs/package*.json ./extractors/ukvisajobs/
|
||||
|
||||
# Install Node dependencies with npm cache (dev deps needed for build)
|
||||
@ -59,6 +60,7 @@ COPY extractors/adzuna ./extractors/adzuna
|
||||
COPY extractors/hiringcafe ./extractors/hiringcafe
|
||||
COPY extractors/gradcracker ./extractors/gradcracker
|
||||
COPY extractors/jobspy ./extractors/jobspy
|
||||
COPY extractors/startupjobs ./extractors/startupjobs
|
||||
COPY extractors/ukvisajobs ./extractors/ukvisajobs
|
||||
|
||||
# Build documentation site bundle
|
||||
@ -105,6 +107,7 @@ COPY orchestrator/package*.json ./orchestrator/
|
||||
COPY extractors/adzuna/package*.json ./extractors/adzuna/
|
||||
COPY extractors/hiringcafe/package*.json ./extractors/hiringcafe/
|
||||
COPY extractors/gradcracker/package*.json ./extractors/gradcracker/
|
||||
COPY extractors/startupjobs/package*.json ./extractors/startupjobs/
|
||||
COPY extractors/ukvisajobs/package*.json ./extractors/ukvisajobs/
|
||||
|
||||
# Install production Node dependencies only
|
||||
@ -122,6 +125,7 @@ COPY extractors/adzuna ./extractors/adzuna
|
||||
COPY extractors/hiringcafe ./extractors/hiringcafe
|
||||
COPY extractors/gradcracker ./extractors/gradcracker
|
||||
COPY extractors/jobspy ./extractors/jobspy
|
||||
COPY extractors/startupjobs ./extractors/startupjobs
|
||||
COPY extractors/ukvisajobs ./extractors/ukvisajobs
|
||||
|
||||
# Reuse Camoufox binaries from builder instead of fetching again
|
||||
|
||||
@ -17,6 +17,7 @@ Extractor integrations are now registered through manifests and loaded automatic
|
||||
| [JobSpy](/docs/next/extractors/jobspy) | Multi-source discovery (Indeed, LinkedIn, Glassdoor) | Requires Python wrapper execution per term; source availability and quality vary by site/location | `JOBSPY_SITES`, `JOBSPY_SEARCH_TERMS`, `JOBSPY_RESULTS_WANTED`, `JOBSPY_HOURS_OLD`, `JOBSPY_LINKEDIN_FETCH_DESCRIPTION` | Produces JSON per term, then orchestrator normalizes and de-duplicates by `jobUrl` |
|
||||
| [Adzuna](/docs/next/extractors/adzuna) | API-based multi-country discovery with low scraping overhead | Requires valid App ID/App Key; country must be in Adzuna-supported list | `ADZUNA_APP_ID`, `ADZUNA_APP_KEY`, `ADZUNA_MAX_JOBS_PER_TERM` | API pagination to dataset output; orchestrator maps progress and de-duplicates by `sourceJobId`/`jobUrl` |
|
||||
| [Hiring Cafe](/docs/next/extractors/hiring-cafe) | Browser-backed discovery using Hiring Cafe search APIs | Subject to upstream anti-bot checks; uses browser context and encoded search-state payloads | `HIRING_CAFE_SEARCH_TERMS`, `HIRING_CAFE_COUNTRY`, `HIRING_CAFE_MAX_JOBS_PER_TERM`, `HIRING_CAFE_DATE_FETCHED_PAST_N_DAYS` | Uses existing pipeline term/country/budget knobs and maps directly to normalized jobs |
|
||||
| [startup.jobs](/docs/next/extractors/startup-jobs) | Startup-focused discovery through the published `startup-jobs-scraper` package | No credentials required; detail enrichment depends on Playwright browser binaries being installed | existing pipeline `searchTerms`, selected country/cities, `startupjobsMaxJobsPerTerm` (falls back to `jobspyResultsWanted`); `npx playwright install` for fresh environments | Algolia-backed search plus detail-page enrichment via package import; orchestrator maps normalized records and de-duplicates by `jobUrl` |
|
||||
| [UKVisaJobs](/docs/next/extractors/ukvisajobs) | UK visa sponsorship-focused roles | Requires authenticated session and periodic token/cookie refresh | `UKVISAJOBS_EMAIL`, `UKVISAJOBS_PASSWORD`, `UKVISAJOBS_MAX_JOBS`, `UKVISAJOBS_SEARCH_KEYWORD` | API pagination + dataset output; orchestrator de-dupes and may fetch missing descriptions |
|
||||
| [Manual Import](/docs/next/extractors/manual) | One-off jobs not covered by scrapers | Inference quality depends on model/provider and input quality; some URLs cannot be fetched reliably | App/API endpoints (`/api/manual-jobs/infer`, `/api/manual-jobs/import`) | Accepts text/HTML/URL, runs inference, then saves and scores job after review |
|
||||
|
||||
@ -25,6 +26,7 @@ Extractor integrations are now registered through manifests and loaded automatic
|
||||
- Use **JobSpy** for broad first-pass sourcing across common boards.
|
||||
- Use **Adzuna** when you want API-first discovery in supported non-UK markets.
|
||||
- Use **Hiring Cafe** when you want another term/country-driven source without adding credentials.
|
||||
- Use **startup.jobs** when you want startup-heavy listings without maintaining another scraper locally.
|
||||
- Use **Gradcracker** when targeting graduate pipelines in the UK.
|
||||
- Use **UKVisaJobs** for sponsorship-specific UK searches.
|
||||
- Use **Manual Import** when you already have a specific posting and need direct import.
|
||||
@ -37,6 +39,7 @@ Many runs combine sources: broad discovery first, then manual import for high-pr
|
||||
- [JobSpy](/docs/next/extractors/jobspy)
|
||||
- [Adzuna](/docs/next/extractors/adzuna)
|
||||
- [Hiring Cafe](/docs/next/extractors/hiring-cafe)
|
||||
- [startup.jobs](/docs/next/extractors/startup-jobs)
|
||||
- [UKVisaJobs](/docs/next/extractors/ukvisajobs)
|
||||
- [Manual Import](/docs/next/extractors/manual)
|
||||
- [Add an Extractor](/docs/next/workflows/add-an-extractor)
|
||||
|
||||
64
docs-site/docs/extractors/startup-jobs.md
Normal file
64
docs-site/docs/extractors/startup-jobs.md
Normal file
@ -0,0 +1,64 @@
|
||||
---
|
||||
id: startup-jobs
|
||||
title: startup.jobs Extractor
|
||||
description: startup.jobs extraction integrated through the startup-jobs-scraper package.
|
||||
sidebar_position: 8
|
||||
---
|
||||
|
||||
## What it is
|
||||
|
||||
Original website: [startup.jobs](https://startup.jobs)
|
||||
|
||||
This extractor wraps the published [`startup-jobs-scraper`](https://www.npmjs.com/package/startup-jobs-scraper) package and feeds normalized startup.jobs listings into the existing pipeline.
|
||||
|
||||
Implementation split:
|
||||
|
||||
1. `extractors/startupjobs/src/run.ts` calls `scrapeStartupJobsViaAlgolia` and maps package records into `CreateJobInput`.
|
||||
2. `extractors/startupjobs/src/manifest.ts` adapts pipeline settings, emits progress updates, and registers the source for runtime discovery.
|
||||
|
||||
## Why it exists
|
||||
|
||||
startup.jobs adds a startup-focused board to job-ops without introducing another bespoke scraper in this repository.
|
||||
|
||||
Using the published package also keeps the integration small and makes it easier to evolve the scraping logic independently from the app.
|
||||
|
||||
## How to use it
|
||||
|
||||
1. Open **Run jobs** and choose **Automatic**.
|
||||
2. Leave **startup.jobs** enabled in **Sources** or toggle it on.
|
||||
3. Set your usual automatic run controls:
|
||||
- `searchTerms` are sent as `query`.
|
||||
- country or city filters are reused as the package `location` option.
|
||||
- the run budget (`startupjobsMaxJobsPerTerm`, falling back to `jobspyResultsWanted` when it is not set) is used as `requestedCount` per term.
|
||||
4. Start the run and monitor progress in the pipeline progress card.
|
||||
|
||||
Defaults and constraints:
|
||||
|
||||
- No new credentials are required.
|
||||
- The integration runs with `enrichDetails: true`, so it opens job detail pages for richer records.
|
||||
- Browser binaries are not downloaded automatically with the package. Install them with `npx playwright install` before using this extractor in a fresh environment.
|
||||
- When **Search cities** is set, the extractor runs once for every city and search term combination (for example, 2 cities and 3 terms produce 6 runs).
|
||||
- Without explicit cities, the selected country is used as the location filter except for broad modes such as `worldwide` and `usa/ca`.
|
||||
|
||||
## Common problems
|
||||
|
||||
### startup.jobs does not appear in sources
|
||||
|
||||
- Check that the app is running a build that includes the new extractor manifest.
|
||||
- This source does not require credentials, so it should appear as soon as the updated build is loaded.
|
||||
|
||||
### Results are broader than expected
|
||||
|
||||
- If no city is configured, the extractor uses the selected country when possible and otherwise falls back to a broad search.
|
||||
- Add **Search cities** when you want tighter geographic filtering.
|
||||
|
||||
### Job descriptions are missing
|
||||
|
||||
- Detail enrichment depends on Playwright browser binaries being installed locally.
|
||||
- Run `npx playwright install` and retry if the extractor cannot open job detail pages.
|
||||
|
||||
## Related pages
|
||||
|
||||
- [Extractors Overview](/docs/next/extractors/overview)
|
||||
- [Pipeline Run](/docs/next/features/pipeline-run)
|
||||
- [Add an Extractor](/docs/next/workflows/add-an-extractor)
|
||||
@ -49,6 +49,7 @@ const sidebars: SidebarsConfig = {
|
||||
"extractors/jobspy",
|
||||
"extractors/adzuna",
|
||||
"extractors/hiring-cafe",
|
||||
"extractors/startup-jobs",
|
||||
"extractors/manual",
|
||||
"extractors/ukvisajobs",
|
||||
],
|
||||
|
||||
10
extractors/startupjobs/README.md
Normal file
10
extractors/startupjobs/README.md
Normal file
@ -0,0 +1,10 @@
|
||||
# startup.jobs Extractor
|
||||
|
||||
Extractor wrapper around the published `startup-jobs-scraper` package.
|
||||
|
||||
## Notes
|
||||
|
||||
- Uses `scrapeStartupJobsViaAlgolia` directly from `startup-jobs-scraper`.
|
||||
- Runs with `enrichDetails: true` so job descriptions and other detail-page fields are fetched during pipeline runs.
|
||||
- Browser binaries are not downloaded automatically. Install them with `npx playwright install` or `npm --workspace startupjobs-extractor run get-binaries`.
|
||||
- Reuses the pipeline's existing search terms, country, city, and budget controls.
|
||||
17
extractors/startupjobs/package.json
Normal file
17
extractors/startupjobs/package.json
Normal file
@ -0,0 +1,17 @@
|
||||
{
|
||||
"name": "startupjobs-extractor",
|
||||
"version": "0.0.1",
|
||||
"type": "module",
|
||||
"description": "startup.jobs extractor backed by the startup-jobs-scraper package",
|
||||
"dependencies": {
|
||||
"startup-jobs-scraper": "^0.1.0"
|
||||
},
|
||||
"devDependencies": {
|
||||
"@types/node": "^24.0.0",
|
||||
"typescript": "~5.9.0"
|
||||
},
|
||||
"scripts": {
|
||||
"check:types": "tsc --noEmit",
|
||||
"get-binaries": "npx playwright install"
|
||||
}
|
||||
}
|
||||
89
extractors/startupjobs/src/manifest.ts
Normal file
89
extractors/startupjobs/src/manifest.ts
Normal file
@ -0,0 +1,89 @@
|
||||
import { resolveSearchCities } from "@shared/search-cities.js";
|
||||
import type {
|
||||
ExtractorManifest,
|
||||
ExtractorProgressEvent,
|
||||
} from "@shared/types/extractors";
|
||||
import { runStartupJobs } from "./run";
|
||||
|
||||
/**
 * Translates a startup.jobs run event into the orchestrator's generic
 * extractor progress shape.
 *
 * A `term_start` event reports the number of terms finished *before* the
 * current one (never below zero); every other event type is treated as a
 * completion and reports the per-term job count, defaulting to 0 when the
 * event carries none.
 *
 * @param event - Run event emitted while iterating search terms/locations.
 * @returns Progress payload for the pipeline progress card.
 */
function toProgress(event: {
  type: string;
  termIndex: number;
  termTotal: number;
  searchTerm: string;
  location?: string;
  jobsFoundTerm?: number;
}): ExtractorProgressEvent {
  const { type, termIndex, termTotal, searchTerm, location } = event;

  // Human-readable label for the current unit of work, e.g. "backend @ Berlin".
  const scope = location ? `${searchTerm} @ ${location}` : searchTerm;

  if (type !== "term_start") {
    const found = event.jobsFoundTerm ?? 0;
    return {
      phase: "list",
      termsProcessed: termIndex,
      termsTotal: termTotal,
      currentUrl: scope,
      jobPagesProcessed: found,
      jobPagesEnqueued: found,
      detail: `startup.jobs: completed ${termIndex}/${termTotal} (${scope}) with ${found} jobs`,
    };
  }

  return {
    phase: "list",
    termsProcessed: Math.max(termIndex - 1, 0),
    termsTotal: termTotal,
    currentUrl: scope,
    detail: `startup.jobs: term ${termIndex}/${termTotal} (${scope})`,
  };
}
|
||||
|
||||
/**
 * Extractor manifest that registers the startup.jobs source.
 *
 * `run` adapts the pipeline context into `runStartupJobs` options:
 * - the per-term budget comes from `startupjobsMaxJobsPerTerm`, then
 *   `jobspyResultsWanted`, and finally defaults to 50 (clamped to >= 1);
 * - locations come from `searchCities`, falling back to `jobspyLocation`;
 * - progress events are forwarded unless the run has been cancelled.
 */
export const manifest: ExtractorManifest = {
  id: "startupjobs",
  displayName: "startup.jobs",
  providesSources: ["startupjobs"],
  async run(context) {
    // Bail out before doing any work if the run was cancelled up front.
    if (context.shouldCancel?.()) {
      return { success: true, jobs: [] };
    }

    const { settings } = context;

    // Prefer the dedicated setting; only consult the shared JobSpy budget
    // when the dedicated one is absent/empty.
    let requestedBudget = Number.NaN;
    if (settings.startupjobsMaxJobsPerTerm) {
      requestedBudget = Number.parseInt(settings.startupjobsMaxJobsPerTerm, 10);
    } else if (settings.jobspyResultsWanted) {
      requestedBudget = Number.parseInt(settings.jobspyResultsWanted, 10);
    }

    let maxJobsPerTerm = 50;
    if (Number.isFinite(requestedBudget)) {
      maxJobsPerTerm = Math.max(1, requestedBudget);
    }

    const outcome = await runStartupJobs({
      selectedCountry: context.selectedCountry,
      searchTerms: context.searchTerms,
      locations: resolveSearchCities({
        single: settings.searchCities ?? settings.jobspyLocation,
      }),
      maxJobsPerTerm,
      shouldCancel: context.shouldCancel,
      onProgress: (event) => {
        // Suppress progress noise once cancellation has been requested.
        if (context.shouldCancel?.()) return;
        context.onProgress?.(toProgress(event));
      },
    });

    if (outcome.success) {
      return {
        success: true,
        jobs: outcome.jobs,
      };
    }

    return {
      success: false,
      jobs: [],
      error: outcome.error,
    };
  },
};
|
||||
|
||||
export default manifest;
|
||||
198
extractors/startupjobs/src/run.ts
Normal file
198
extractors/startupjobs/src/run.ts
Normal file
@ -0,0 +1,198 @@
|
||||
import {
|
||||
formatCountryLabel,
|
||||
normalizeCountryKey,
|
||||
} from "@shared/location-support.js";
|
||||
import { resolveSearchCities } from "@shared/search-cities.js";
|
||||
import type { CreateJobInput } from "@shared/types/jobs";
|
||||
import {
|
||||
type StartupJobRecord,
|
||||
scrapeStartupJobsViaAlgolia,
|
||||
} from "startup-jobs-scraper";
|
||||
|
||||
/**
 * Progress notifications emitted by `runStartupJobs`.
 *
 * Both variants carry the 1-based index of the current term/location
 * combination, the total number of combinations, the search term, and the
 * optional location. `term_complete` additionally reports how many new
 * (de-duplicated) jobs that combination contributed.
 */
export type StartupJobsProgressEvent = {
  termIndex: number;
  termTotal: number;
  searchTerm: string;
  location?: string;
} & (
  | { type: "term_start" }
  | { type: "term_complete"; jobsFoundTerm: number }
);
|
||||
|
||||
/** Options accepted by `runStartupJobs`. All fields are optional. */
export interface RunStartupJobsOptions {
  /** Queries to run; when omitted or empty, `["software engineer"]` is used. */
  searchTerms?: string[];
  /** Explicit locations to search; takes precedence over `selectedCountry`. */
  locations?: string[];
  /** Country used as the location filter when no explicit location applies. */
  selectedCountry?: string;
  /** Requested result count per term/location combination (default 50, minimum 1). */
  maxJobsPerTerm?: number;
  /** Polled before each term/location combination; returning `true` stops the run. */
  shouldCancel?: () => boolean;
  /** Receives a start and a completion event for every term/location combination. */
  onProgress?: (event: StartupJobsProgressEvent) => void;
}
|
||||
|
||||
/** Outcome of a `runStartupJobs` invocation. */
export interface StartupJobsResult {
  /** Mapped, de-duplicated jobs; empty when the run failed. */
  jobs: CreateJobInput[];
  /** `false` when the scraper threw; `true` otherwise (including cancellation). */
  success: boolean;
  /** Failure message, present only when `success` is `false`. */
  error?: string;
}
|
||||
|
||||
/**
 * Coerces `value` into an integer that is at least 1.
 *
 * Numbers are used as-is, strings are parsed base-10, and anything else is
 * treated as missing. Non-finite results (NaN, +/-Infinity) yield `fallback`
 * unchanged; finite results are floored and clamped to a minimum of 1.
 *
 * @param value - Raw numeric or string setting, possibly undefined.
 * @param fallback - Value returned when `value` cannot be interpreted.
 * @returns A positive integer, or `fallback` when `value` is unusable.
 */
function toPositiveIntOrFallback(
  value: number | string | undefined,
  fallback: number,
): number {
  let candidate = Number.NaN;
  if (typeof value === "number") {
    candidate = value;
  } else if (typeof value === "string") {
    candidate = Number.parseInt(value, 10);
  }

  if (Number.isFinite(candidate)) {
    return Math.max(1, Math.floor(candidate));
  }
  return fallback;
}
|
||||
|
||||
/**
 * Derives a job type from a pipe-delimited `disciplines` string.
 *
 * The string is split on `|`, each piece is trimmed, and empty pieces are
 * discarded. The last remaining piece is returned only when more than one
 * piece survives; a single piece (or none) yields `undefined`.
 *
 * @param disciplines - Pipe-separated discipline text, if any.
 * @returns The trailing segment, or `undefined` when it cannot be inferred.
 */
function inferJobType(disciplines: string | undefined): string | undefined {
  if (!disciplines) return undefined;

  const parts: string[] = [];
  for (const piece of disciplines.split("|")) {
    const trimmed = piece.trim();
    if (trimmed) parts.push(trimmed);
  }

  if (parts.length <= 1) return undefined;
  return parts[parts.length - 1];
}
|
||||
|
||||
/**
 * Converts a scraper record into the pipeline's `CreateJobInput` shape.
 *
 * Records without a `jobUrl` are rejected (`null`). Title and employer fall
 * back to placeholder strings, the application link falls back to the job
 * URL, and every other falsy field is normalized to `undefined`.
 *
 * @param row - Record returned by `startup-jobs-scraper`.
 * @returns The mapped job, or `null` when the record has no job URL.
 */
function mapStartupJob(row: StartupJobRecord): CreateJobInput | null {
  const { jobUrl, location, disciplines } = row;
  if (!jobUrl) return null;

  const job: CreateJobInput = {
    source: "startupjobs",
    title: row.title || "Unknown Title",
    employer: row.employer || "Unknown Employer",
    employerUrl: row.employerUrl || undefined,
    jobUrl,
    applicationLink: row.applicationLink || jobUrl,
    disciplines: disciplines || undefined,
    deadline: row.deadline || undefined,
    salary: row.salary || undefined,
    location: location || undefined,
    degreeRequired: row.degreeRequired || undefined,
    starting: row.starting || undefined,
    jobDescription: row.jobDescription || undefined,
    jobType: inferJobType(disciplines),
    // Case-insensitive substring check; stays undefined when no location is given.
    isRemote: location?.toLowerCase().includes("remote"),
  };

  return job;
}
|
||||
|
||||
/**
 * Works out which location filters a run should iterate over.
 *
 * Explicit locations win: each is normalized, the broad sentinels
 * (`worldwide`, `usa/ca`) are dropped, and the remainder is formatted as a
 * label. When nothing remains, the selected country is used instead, unless
 * it is missing or itself a broad sentinel, in which case a single `null`
 * entry requests an unfiltered search.
 *
 * NOTE(review): explicit entries pass through `normalizeCountryKey` /
 * `formatCountryLabel` even when they are cities — confirm those helpers
 * leave city names intact.
 *
 * @returns A non-empty list of location labels, or `[null]` for no filter.
 */
function resolveRunLocations(args: {
  selectedCountry?: string;
  locations?: string[];
}): Array<string | null> {
  const requested = resolveSearchCities({
    list: args.locations,
  });

  const narrowKeys = requested
    .map((entry) => normalizeCountryKey(entry))
    .filter((key) => !(key === "worldwide" || key === "usa/ca"));

  if (narrowKeys.length === 0) {
    const countryKey = normalizeCountryKey(args.selectedCountry);
    const isBroadCountry =
      !countryKey || countryKey === "worldwide" || countryKey === "usa/ca";
    return [isBroadCountry ? null : formatCountryLabel(countryKey)];
  }

  return narrowKeys.map((key) => formatCountryLabel(key));
}
|
||||
|
||||
/**
 * Runs the startup.jobs scraper for every location/search-term combination
 * and returns the mapped, de-duplicated jobs.
 *
 * Locations are the outer loop and search terms the inner loop. Before each
 * combination the cancel hook is polled; on cancellation the jobs gathered so
 * far are returned with `success: true`. Jobs are de-duplicated by `jobUrl`
 * across the whole run. Any thrown error ends the run with `success: false`
 * and an empty job list; if the message looks like a missing Playwright
 * browser, an install hint is appended.
 *
 * @param options - Search terms, location inputs, budget and hooks.
 * @returns The run outcome; this function does not reject on scraper errors.
 */
export async function runStartupJobs(
  options: RunStartupJobsOptions = {},
): Promise<StartupJobsResult> {
  let searchTerms = ["software engineer"];
  if (options.searchTerms && options.searchTerms.length > 0) {
    searchTerms = options.searchTerms;
  }

  const runLocations = resolveRunLocations({
    selectedCountry: options.selectedCountry,
    locations: options.locations,
  });
  const maxJobsPerTerm = toPositiveIntOrFallback(options.maxJobsPerTerm, 50);
  const termTotal = searchTerms.length * runLocations.length;

  const jobs: CreateJobInput[] = [];
  const seen = new Set<string>();
  let runIndex = 0;

  try {
    for (const location of runLocations) {
      // The scraper and progress events expect `undefined`, not `null`, for "no filter".
      const locationFilter = location ?? undefined;

      for (const searchTerm of searchTerms) {
        runIndex += 1;
        if (options.shouldCancel?.()) {
          return { success: true, jobs };
        }

        options.onProgress?.({
          type: "term_start",
          termIndex: runIndex,
          termTotal,
          searchTerm,
          location: locationFilter,
        });

        const records = await scrapeStartupJobsViaAlgolia({
          query: searchTerm,
          requestedCount: maxJobsPerTerm,
          enrichDetails: true,
          location: locationFilter,
        });

        // Count only jobs that are new to this run, not raw records returned.
        const countBefore = jobs.length;
        for (const record of records) {
          const job = mapStartupJob(record);
          if (job === null || seen.has(job.jobUrl)) continue;
          seen.add(job.jobUrl);
          jobs.push(job);
        }
        const jobsFoundTerm = jobs.length - countBefore;

        options.onProgress?.({
          type: "term_complete",
          termIndex: runIndex,
          termTotal,
          searchTerm,
          location: locationFilter,
          jobsFoundTerm,
        });
      }
    }

    return {
      success: true,
      jobs,
    };
  } catch (error) {
    let message = "Unexpected error while running startup.jobs extractor.";
    if (error instanceof Error) {
      message = error.message;
    } else if (typeof error === "string") {
      message = error;
    }

    // Heuristic: a message naming the browser/Playwright together with
    // "install" is treated as missing browser binaries.
    const mentionsBrowser = /playwright|browser|executable/i.test(message);
    const mentionsInstall = /install/i.test(message);

    if (mentionsBrowser && mentionsInstall) {
      return {
        success: false,
        jobs: [],
        error: `${message}. Install browser binaries with 'npx playwright install'.`,
      };
    }

    return {
      success: false,
      jobs: [],
      error: message,
    };
  }
}
|
||||
38
extractors/startupjobs/tests/manifest.test.ts
Normal file
38
extractors/startupjobs/tests/manifest.test.ts
Normal file
@ -0,0 +1,38 @@
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
vi.mock("../src/run", () => ({
|
||||
runStartupJobs: vi.fn(),
|
||||
}));
|
||||
|
||||
// Verifies how the manifest derives run options from pipeline settings.
describe("startupjobs manifest", () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  it("prefers startupjobsMaxJobsPerTerm when provided", async () => {
    const { manifest } = await import("../src/manifest");
    const { runStartupJobs } = await import("../src/run");

    // `../src/run` is mocked at module level, so this is a vi.fn().
    const runStartupJobsMock = vi.mocked(runStartupJobs);
    runStartupJobsMock.mockResolvedValue({ success: true, jobs: [] });

    // Both budgets are set; the dedicated startupjobs one must win.
    await manifest.run({
      source: "startupjobs",
      selectedSources: ["startupjobs"],
      settings: {
        startupjobsMaxJobsPerTerm: "70",
        jobspyResultsWanted: "30",
      },
      searchTerms: ["software engineer"],
      selectedCountry: "united kingdom",
    });

    const expectedOptions = expect.objectContaining({ maxJobsPerTerm: 70 });
    expect(runStartupJobsMock).toHaveBeenCalledWith(expectedOptions);
  });
});
|
||||
75
extractors/startupjobs/tests/run.test.ts
Normal file
75
extractors/startupjobs/tests/run.test.ts
Normal file
@ -0,0 +1,75 @@
|
||||
import { beforeEach, describe, expect, it, vi } from "vitest";
|
||||
|
||||
vi.mock("startup-jobs-scraper", () => ({
|
||||
scrapeStartupJobsViaAlgolia: vi.fn(),
|
||||
}));
|
||||
|
||||
// Covers budget fallback and location resolution in runStartupJobs.
describe("runStartupJobs", () => {
  beforeEach(() => {
    vi.clearAllMocks();
  });

  /**
   * Loads the mocked scraper (primed to resolve once with no records) and
   * then the module under test, in that order.
   */
  async function loadWithEmptyScrape() {
    const { scrapeStartupJobsViaAlgolia } = await import(
      "startup-jobs-scraper"
    );
    const scrapeMock = vi.mocked(scrapeStartupJobsViaAlgolia);
    scrapeMock.mockResolvedValueOnce([]);

    const { runStartupJobs } = await import("../src/run");
    return { scrapeMock, runStartupJobs };
  }

  it("falls back to the default max jobs per term when options.maxJobsPerTerm is NaN", async () => {
    const { scrapeMock, runStartupJobs } = await loadWithEmptyScrape();

    await runStartupJobs({
      searchTerms: ["backend engineer"],
      maxJobsPerTerm: Number.NaN,
    });

    expect(scrapeMock).toHaveBeenCalledWith(
      expect.objectContaining({ requestedCount: 50 }),
    );
  });

  it("drops broad location sentinels and falls back to selectedCountry behavior", async () => {
    const { scrapeMock, runStartupJobs } = await loadWithEmptyScrape();

    await runStartupJobs({
      searchTerms: ["platform engineer"],
      selectedCountry: "worldwide",
      locations: ["Worldwide"],
    });

    expect(scrapeMock).toHaveBeenCalledWith(
      expect.objectContaining({ location: undefined }),
    );
  });

  it("normalizes explicit city-country aliases before passing location to the scraper", async () => {
    const { scrapeMock, runStartupJobs } = await loadWithEmptyScrape();

    await runStartupJobs({
      searchTerms: ["software engineer"],
      locations: ["UK"],
    });

    expect(scrapeMock).toHaveBeenCalledWith(
      expect.objectContaining({ location: "United Kingdom" }),
    );
  });
});
|
||||
16
extractors/startupjobs/tsconfig.json
Normal file
16
extractors/startupjobs/tsconfig.json
Normal file
@ -0,0 +1,16 @@
|
||||
{
|
||||
"compilerOptions": {
|
||||
"module": "ESNext",
|
||||
"moduleResolution": "bundler",
|
||||
"target": "ES2022",
|
||||
"strict": true,
|
||||
"noUnusedLocals": false,
|
||||
"lib": ["ES2022", "DOM"],
|
||||
"types": ["node"],
|
||||
"baseUrl": ".",
|
||||
"paths": {
|
||||
"@shared/*": ["../../shared/src/*"]
|
||||
}
|
||||
},
|
||||
"include": ["./src/**/*"]
|
||||
}
|
||||
@ -753,6 +753,7 @@ describe("OrchestratorPage", () => {
|
||||
gradcrackerMaxJobsPerTerm: 150,
|
||||
ukvisajobsMaxJobs: 150,
|
||||
adzunaMaxJobsPerTerm: 150,
|
||||
startupjobsMaxJobsPerTerm: 150,
|
||||
jobspyCountryIndeed: "united kingdom",
|
||||
searchCities: "United Kingdom",
|
||||
});
|
||||
|
||||
@ -182,6 +182,7 @@ export const AutomaticRunTab: React.FC<AutomaticRunTabProps> = ({
|
||||
|
||||
const rememberedRunBudget =
|
||||
settings?.jobspyResultsWanted?.value ??
|
||||
settings?.startupjobsMaxJobsPerTerm?.value ??
|
||||
settings?.adzunaMaxJobsPerTerm?.value ??
|
||||
settings?.gradcrackerMaxJobsPerTerm?.value ??
|
||||
settings?.ukvisajobsMaxJobs?.value ??
|
||||
|
||||
@ -52,6 +52,17 @@ describe("automatic-run utilities", () => {
|
||||
expect(cap).toBeLessThanOrEqual(750);
|
||||
});
|
||||
|
||||
it("assigns a dedicated startupjobs max-jobs limit", () => {
|
||||
const limits = deriveExtractorLimits({
|
||||
budget: 120,
|
||||
searchTerms: ["backend", "platform"],
|
||||
sources: ["startupjobs"],
|
||||
});
|
||||
|
||||
expect(limits.startupjobsMaxJobsPerTerm).toBeGreaterThan(0);
|
||||
expect(limits.startupjobsMaxJobsPerTerm).toBeLessThanOrEqual(120);
|
||||
});
|
||||
|
||||
it("returns zero estimate when no search terms are provided", () => {
|
||||
const estimate = calculateAutomaticEstimate({
|
||||
values: {
|
||||
@ -112,4 +123,21 @@ describe("automatic-run utilities", () => {
|
||||
expect(estimate.discovered.cap).toBeGreaterThan(0);
|
||||
expect(estimate.discovered.cap).toBeLessThanOrEqual(120);
|
||||
});
|
||||
|
||||
it("includes startupjobs in estimate caps using the shared term budget", () => {
|
||||
const estimate = calculateAutomaticEstimate({
|
||||
values: {
|
||||
topN: 10,
|
||||
minSuitabilityScore: 50,
|
||||
searchTerms: ["backend", "platform"],
|
||||
runBudget: 120,
|
||||
country: "united kingdom",
|
||||
cityLocations: [],
|
||||
},
|
||||
sources: ["startupjobs"],
|
||||
});
|
||||
|
||||
expect(estimate.discovered.cap).toBeGreaterThan(0);
|
||||
expect(estimate.discovered.cap).toBeLessThanOrEqual(120);
|
||||
});
|
||||
});
|
||||
|
||||
@ -66,6 +66,7 @@ export interface ExtractorLimits {
|
||||
gradcrackerMaxJobsPerTerm: number;
|
||||
ukvisajobsMaxJobs: number;
|
||||
adzunaMaxJobsPerTerm: number;
|
||||
startupjobsMaxJobsPerTerm: number;
|
||||
}
|
||||
|
||||
export function deriveExtractorLimits(args: {
|
||||
@ -82,6 +83,7 @@ export function deriveExtractorLimits(args: {
|
||||
const includesUkVisaJobs = args.sources.includes("ukvisajobs");
|
||||
const includesAdzuna = args.sources.includes("adzuna");
|
||||
const includesHiringCafe = args.sources.includes("hiringcafe");
|
||||
const includesStartupJobs = args.sources.includes("startupjobs");
|
||||
|
||||
const weightedContributors =
|
||||
(includesIndeed ? termCount : 0) +
|
||||
@ -90,7 +92,8 @@ export function deriveExtractorLimits(args: {
|
||||
(includesGradcracker ? termCount : 0) +
|
||||
(includesUkVisaJobs ? 1 : 0) +
|
||||
(includesAdzuna ? termCount : 0) +
|
||||
(includesHiringCafe ? termCount : 0);
|
||||
(includesHiringCafe ? termCount : 0) +
|
||||
(includesStartupJobs ? termCount : 0);
|
||||
|
||||
if (weightedContributors <= 0) {
|
||||
return {
|
||||
@ -98,6 +101,7 @@ export function deriveExtractorLimits(args: {
|
||||
gradcrackerMaxJobsPerTerm: budget,
|
||||
ukvisajobsMaxJobs: budget,
|
||||
adzunaMaxJobsPerTerm: budget,
|
||||
startupjobsMaxJobsPerTerm: budget,
|
||||
};
|
||||
}
|
||||
|
||||
@ -109,6 +113,7 @@ export function deriveExtractorLimits(args: {
|
||||
gradcrackerMaxJobsPerTerm: perUnit,
|
||||
ukvisajobsMaxJobs: Math.min(budget, perUnit + remainder),
|
||||
adzunaMaxJobsPerTerm: perUnit,
|
||||
startupjobsMaxJobsPerTerm: perUnit,
|
||||
};
|
||||
}
|
||||
|
||||
@ -173,6 +178,7 @@ export function calculateAutomaticEstimate(args: {
|
||||
const hasGlassdoor = sources.includes("glassdoor");
|
||||
const hasAdzuna = sources.includes("adzuna");
|
||||
const hasHiringCafe = sources.includes("hiringcafe");
|
||||
const hasStartupJobs = sources.includes("startupjobs");
|
||||
const limits = deriveExtractorLimits({
|
||||
budget: values.runBudget,
|
||||
searchTerms: values.searchTerms,
|
||||
@ -191,9 +197,17 @@ export function calculateAutomaticEstimate(args: {
|
||||
const hiringCafeCap = hasHiringCafe
|
||||
? limits.jobspyResultsWanted * termCount
|
||||
: 0;
|
||||
const startupJobsCap = hasStartupJobs
|
||||
? limits.startupjobsMaxJobsPerTerm * termCount
|
||||
: 0;
|
||||
|
||||
const discoveredCap =
|
||||
jobspyCap + gradcrackerCap + ukvisaCap + adzunaCap + hiringCafeCap;
|
||||
jobspyCap +
|
||||
gradcrackerCap +
|
||||
ukvisaCap +
|
||||
adzunaCap +
|
||||
hiringCafeCap +
|
||||
startupJobsCap;
|
||||
const discoveredMin = Math.round(discoveredCap * 0.35);
|
||||
const discoveredMax = Math.round(discoveredCap * 0.75);
|
||||
const processedMin = Math.min(values.topN, discoveredMin);
|
||||
|
||||
@ -181,11 +181,13 @@ export function usePipelineControls(
|
||||
);
|
||||
const hasAdzuna = compatibleSources.includes("adzuna");
|
||||
const hasHiringCafe = compatibleSources.includes("hiringcafe");
|
||||
const hasStartupJobs = compatibleSources.includes("startupjobs");
|
||||
const serializedCities = serializeCityLocationsSetting(
|
||||
values.cityLocations,
|
||||
);
|
||||
const searchCities =
|
||||
(hasJobSpySite || hasAdzuna || hasHiringCafe) && serializedCities
|
||||
(hasJobSpySite || hasAdzuna || hasHiringCafe || hasStartupJobs) &&
|
||||
serializedCities
|
||||
? serializedCities
|
||||
: formatCountryLabel(values.country);
|
||||
await api.updateSettings({
|
||||
@ -194,6 +196,7 @@ export function usePipelineControls(
|
||||
gradcrackerMaxJobsPerTerm: limits.gradcrackerMaxJobsPerTerm,
|
||||
ukvisajobsMaxJobs: limits.ukvisajobsMaxJobs,
|
||||
adzunaMaxJobsPerTerm: limits.adzunaMaxJobsPerTerm,
|
||||
startupjobsMaxJobsPerTerm: limits.startupjobsMaxJobsPerTerm,
|
||||
jobspyCountryIndeed: values.country,
|
||||
searchCities,
|
||||
});
|
||||
|
||||
@ -17,6 +17,10 @@ describe("orchestrator utils", () => {
|
||||
expect(getEnabledSources(withoutKey)).not.toContain("adzuna");
|
||||
});
|
||||
|
||||
it("enables startupjobs without credentials", () => {
|
||||
expect(getEnabledSources(createAppSettings())).toContain("startupjobs");
|
||||
});
|
||||
|
||||
it("counts processing jobs in ready and discovered tabs", () => {
|
||||
const jobs = [
|
||||
createJob({ id: "ready", status: "ready", closedAt: null }),
|
||||
|
||||
@ -195,6 +195,10 @@ export const getEnabledSources = (
|
||||
enabled.push(source);
|
||||
continue;
|
||||
}
|
||||
if (source === "startupjobs") {
|
||||
enabled.push(source);
|
||||
continue;
|
||||
}
|
||||
if (
|
||||
source === "indeed" ||
|
||||
source === "linkedin" ||
|
||||
|
||||
@ -254,6 +254,7 @@ export const DEMO_SOURCE_BASE_URLS: Record<JobSource, string> = {
|
||||
ukvisajobs: "https://www.ukvisajobs.com",
|
||||
adzuna: "https://www.adzuna.com",
|
||||
hiringcafe: "https://hiring.cafe",
|
||||
startupjobs: "https://startup.jobs",
|
||||
manual: "https://example.com",
|
||||
};
|
||||
|
||||
|
||||
650
package-lock.json
generated
650
package-lock.json
generated
File diff suppressed because it is too large
Load Diff
@ -8,6 +8,7 @@ export const EXTRACTOR_SOURCE_IDS = [
|
||||
"ukvisajobs",
|
||||
"adzuna",
|
||||
"hiringcafe",
|
||||
"startupjobs",
|
||||
"manual",
|
||||
] as const;
|
||||
|
||||
@ -48,6 +49,7 @@ export const EXTRACTOR_SOURCE_METADATA: Record<
|
||||
requiresCredentials: true,
|
||||
},
|
||||
hiringcafe: { label: "Hiring Cafe", order: 70, category: "pipeline" },
|
||||
startupjobs: { label: "startup.jobs", order: 80, category: "pipeline" },
|
||||
manual: { label: "Manual", order: 90, category: "manual" },
|
||||
};
|
||||
|
||||
|
||||
@ -55,6 +55,10 @@ describe("location-support", () => {
|
||||
expect(isSourceAllowedForCountry("glassdoor", "japan")).toBe(false);
|
||||
expect(isSourceAllowedForCountry("adzuna", "united states")).toBe(true);
|
||||
expect(isSourceAllowedForCountry("adzuna", "japan")).toBe(false);
|
||||
expect(isSourceAllowedForCountry("startupjobs", "united states")).toBe(
|
||||
true,
|
||||
);
|
||||
expect(isSourceAllowedForCountry("startupjobs", "worldwide")).toBe(true);
|
||||
});
|
||||
|
||||
it("filters incompatible sources while preserving compatible order", () => {
|
||||
@ -66,11 +70,12 @@ describe("location-support", () => {
|
||||
"glassdoor",
|
||||
"ukvisajobs",
|
||||
"adzuna",
|
||||
"startupjobs",
|
||||
"linkedin",
|
||||
],
|
||||
"united states",
|
||||
),
|
||||
).toEqual(["indeed", "glassdoor", "adzuna", "linkedin"]);
|
||||
).toEqual(["indeed", "glassdoor", "adzuna", "startupjobs", "linkedin"]);
|
||||
});
|
||||
|
||||
it("supports glassdoor only in explicitly supported countries", () => {
|
||||
|
||||
@ -217,6 +217,19 @@ export const settingsRegistry = {
|
||||
parse: parseIntOrNull,
|
||||
serialize: serializeNullableNumber,
|
||||
},
|
||||
startupjobsMaxJobsPerTerm: {
|
||||
kind: "typed" as const,
|
||||
schema: z.number().int().min(1).max(1000),
|
||||
default: (): number =>
|
||||
parseInt(
|
||||
typeof process !== "undefined"
|
||||
? process.env.STARTUPJOBS_MAX_RESULTS || "50"
|
||||
: "50",
|
||||
10,
|
||||
),
|
||||
parse: parseIntOrNull,
|
||||
serialize: serializeNullableNumber,
|
||||
},
|
||||
searchTerms: {
|
||||
kind: "typed" as const,
|
||||
schema: z.array(z.string().trim().min(1).max(200)).max(100),
|
||||
|
||||
@ -153,6 +153,7 @@ export const createAppSettings = (
|
||||
ukvisajobsMaxJobs: { value: 50, default: 50, override: null },
|
||||
adzunaMaxJobsPerTerm: { value: 50, default: 50, override: null },
|
||||
gradcrackerMaxJobsPerTerm: { value: 50, default: 50, override: null },
|
||||
startupjobsMaxJobsPerTerm: { value: 50, default: 50, override: null },
|
||||
searchTerms: {
|
||||
value: ["Software Engineer"],
|
||||
default: ["Software Engineer"],
|
||||
|
||||
@ -152,6 +152,7 @@ export interface AppSettings {
|
||||
ukvisajobsMaxJobs: Resolved<number>;
|
||||
adzunaMaxJobsPerTerm: Resolved<number>;
|
||||
gradcrackerMaxJobsPerTerm: Resolved<number>;
|
||||
startupjobsMaxJobsPerTerm: Resolved<number>;
|
||||
searchTerms: Resolved<string[]>;
|
||||
blockedCompanyKeywords: Resolved<string[]>;
|
||||
scoringInstructions: Resolved<string>;
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user