Some checks failed
CI / Linting (Biome) (push) Failing after 36s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m6s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m9s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m5s
CI / Type Check (orchestrator) (push) Successful in 1m21s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m4s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m4s
CI / Documentation (push) Successful in 1m52s
Adds extractor packages: arbeitnow, ashby, careerjet, fourdayweek,
greenhouse, himalayas, jobicy, jooble, lever, reed, remoteok, remotive,
themuse, usajobs, weworkremotely, workday — each with manifest, package
metadata and README.
Pipeline / shared:
- shared/job-fingerprint: stable hash for cross-source dedup, with tests
- discover-jobs: dedup via fingerprint and richer per-source merging
- jobs repository: fingerprint-aware upsert / lookup
- settings-registry, settings types/routes, demo-defaults: knobs for the
new sources
- shared extractors index: register the new manifests
- location-support, profiles route: small fixes for the new sources
Tooling:
- scripts/smoke-extractors.ts to sanity-check each source locally
- scripts/jobber-cron-{cherepaha,dobkin}.env.example: per-host cron
templates (CHANGEME placeholders only)
- .env.example: documented env vars for the new extractors
- .gitignore: ignore extractors/*/storage/ runtime caches (was ukvisajobs only)
Co-authored-by: Cursor <cursoragent@cursor.com>
191 lines
5.7 KiB
TypeScript
191 lines
5.7 KiB
TypeScript
/**
 * Remote OK public feed.
 *
 * https://remoteok.com/api — single JSON endpoint that returns the entire
 * active remote-jobs board in one shot. The first array element is metadata
 * (legal/attribution); jobs follow.
 *
 * No auth, no server-side pagination, no per-term query — we fetch once per
 * pipeline run and apply each `searchTerm` as a case-insensitive filter over
 * `position` + `tags` so the orchestrator's per-term iteration still works.
 *
 * Per Remote OK's TOS we send a descriptive User-Agent so they can identify
 * traffic; we do not strip the legal/attribution element from the response.
 */
|
|
|
|
import type {
|
|
ExtractorManifest,
|
|
ExtractorRunResult,
|
|
} from "@shared/types/extractors";
|
|
import type { CreateJobInput } from "@shared/types/jobs";
|
|
|
|
/** Endpoint that returns the whole Remote OK board as a single JSON document. */
const API_URL = "https://remoteok.com/api";

/** Descriptive User-Agent sent with every request so the traffic is identifiable. */
const USER_AGENT = "Mozilla/5.0 (compatible; JobOps/1.0; +https://github.com/) job-search pipeline";
|
|
|
|
/**
 * One job element of the Remote OK feed.
 *
 * Every field is optional because the payload is untyped JSON; consumers in
 * this module validate each value before using it.
 */
interface RemoteOkJob {
  // Identity — `id` is preferred as the source job id, `slug` is the fallback.
  id?: string | number;
  slug?: string;

  // Listing content.
  position?: string;
  company?: string;
  location?: string;
  description?: string;
  tags?: string[];

  // Links — `url` is the listing page, `apply_url` the application target.
  url?: string;
  apply_url?: string;

  // Branding — `company_logo` is preferred, `logo` is the fallback.
  company_logo?: string;
  logo?: string;

  // Posting time; only `date` is read by this module.
  date?: string;
  epoch?: number;

  // Raw salary numbers; 0 means "not specified".
  salary_min?: number;
  salary_max?: number;
}
|
|
|
|
/**
 * Metadata element of the Remote OK feed (legal/attribution notice), which
 * the API places ahead of the job elements.
 */
interface RemoteOkLegalEntry {
  last_updated?: number;
  legal?: string;
}
|
|
|
|
/** Any element of the feed array: the legal/attribution entry or a job. */
type RemoteOkResponseEntry = RemoteOkLegalEntry | RemoteOkJob;
|
|
|
|
/**
 * Normalizes an untyped value to a trimmed, non-empty string.
 *
 * @param value - Any value taken from the feed payload.
 * @returns The trimmed text, or `undefined` when `value` is not a string or
 *   contains only whitespace.
 */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") {
    const stripped = value.trim();
    if (stripped.length > 0) return stripped;
  }
  return undefined;
}
|
|
|
|
/**
 * Type guard that separates job elements from the legal/attribution element.
 *
 * An element counts as a job when it carries at least one of the identifying
 * keys `id`, `position`, `url` or `slug`.
 *
 * The feed is untyped JSON, so at runtime an element may be `null` or a
 * primitive despite the declared parameter type. The `in` operator throws a
 * TypeError for such operands, so they are rejected up front instead of
 * aborting the whole fetch.
 *
 * @param entry - One element of the feed array.
 * @returns `true` when `entry` is an object that looks like a job.
 */
function isJobEntry(entry: RemoteOkResponseEntry): entry is RemoteOkJob {
  if (typeof entry !== "object" || entry === null) return false;
  return (
    "id" in entry || "position" in entry || "url" in entry || "slug" in entry
  );
}
|
|
|
|
/**
 * Checks whether a job matches a search term.
 *
 * The job's `position` and all of its `tags` are joined with single spaces,
 * lower-cased, and searched for `normalizedTerm` as a substring.
 *
 * @param job - Feed job to test.
 * @param normalizedTerm - Search term; callers pass it already lower-cased.
 *   An empty term matches every job.
 * @returns `true` when the term is empty or occurs in the joined text.
 */
function tagMatchesTerm(job: RemoteOkJob, normalizedTerm: string): boolean {
  if (!normalizedTerm) {
    return true;
  }

  const searchableParts = [job.position ?? ""];
  if (Array.isArray(job.tags)) {
    searchableParts.push(...job.tags);
  }

  const searchableText = searchableParts.join(" ").toLowerCase();
  return searchableText.includes(normalizedTerm);
}
|
|
|
|
/**
 * Converts a Remote OK feed job into the pipeline's job-creation shape.
 *
 * @param job - Job element from the feed.
 * @returns The mapped job, or `null` when the job has neither a usable `url`
 *   nor a usable `apply_url`.
 */
function mapJob(job: RemoteOkJob): CreateJobInput | null {
  const applyLink = asString(job.apply_url);
  const listingUrl = asString(job.url) ?? applyLink;
  if (listingUrl === undefined) {
    return null;
  }

  // Remote OK reports salary as raw numbers; 0 means "not specified".
  const positiveAmount = (amount: unknown): number | undefined =>
    typeof amount === "number" && amount > 0 ? amount : undefined;
  const salaryFloor = positiveAmount(job.salary_min);
  const salaryCeiling = positiveAmount(job.salary_max);
  const hasSalary = salaryFloor !== undefined || salaryCeiling !== undefined;

  // Keep only string tags; the payload is untyped JSON.
  const tagList: string[] = [];
  if (Array.isArray(job.tags)) {
    for (const tag of job.tags) {
      if (typeof tag === "string") tagList.push(tag);
    }
  }

  return {
    source: "remoteok",
    sourceJobId: job.id == null ? asString(job.slug) : String(job.id),
    title: asString(job.position) ?? "Unknown Title",
    employer: asString(job.company) ?? "Unknown Employer",
    jobUrl: listingUrl,
    applicationLink: applyLink ?? listingUrl,
    location: asString(job.location) ?? "Remote",
    isRemote: true,
    datePosted: asString(job.date),
    jobDescription: asString(job.description),
    companyLogo: asString(job.company_logo) ?? asString(job.logo),
    disciplines: tagList.length > 0 ? tagList.join(", ") : undefined,
    salaryMinAmount: salaryFloor,
    salaryMaxAmount: salaryCeiling,
    salaryCurrency: hasSalary ? "USD" : undefined,
    salaryInterval: hasSalary ? "yearly" : undefined,
  };
}
|
|
|
|
/**
 * Downloads the complete Remote OK feed and returns only its job elements.
 *
 * The response is parsed as `unknown` and validated before use: a body that
 * is not an array yields an empty list, and elements that are not objects
 * (for example `null`) are skipped, because the `in`-based job check would
 * otherwise throw on them and abort the whole fetch.
 *
 * @returns The job elements of the feed, with the legal/attribution element
 *   and malformed elements removed.
 * @throws Error when the HTTP response status is not OK. Network and JSON
 *   parsing errors raised by `fetch` / `response.json()` propagate unchanged.
 */
async function fetchAll(): Promise<RemoteOkJob[]> {
  const response = await fetch(API_URL, {
    headers: {
      Accept: "application/json",
      "User-Agent": USER_AGENT,
    },
  });
  if (!response.ok) {
    throw new Error(`Remote OK request failed with status ${response.status}`);
  }

  const body: unknown = await response.json();
  if (!Array.isArray(body)) return [];

  return body.filter(
    (entry): entry is RemoteOkJob =>
      typeof entry === "object" && entry !== null && isJobEntry(entry),
  );
}
|
|
|
|
/**
 * Extractor manifest for the Remote OK source.
 *
 * `run` fetches the feed once, then applies each search term as a local
 * filter. Jobs are deduplicated across terms, so a job matched by several
 * terms is returned once and counts only toward the first term that found it.
 */
export const manifest: ExtractorManifest = {
  id: "remoteok",
  displayName: "Remote OK",
  providesSources: ["remoteok"],

  /**
   * Runs one extraction pass.
   *
   * @param context - Supplies search terms, settings, and the optional
   *   `shouldCancel` / `onProgress` callbacks.
   * @returns `success: false` with the error message when the feed cannot be
   *   fetched; otherwise `success: true` with the collected jobs. If
   *   cancellation is requested between terms, the jobs collected so far are
   *   returned.
   */
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    // A non-numeric setting makes parseInt return NaN, and `collected >= NaN`
    // is never true, which would silently disable the per-term cap. Fall back
    // to the default in that case.
    const defaultMaxJobsPerTerm = 100;
    const rawMaxJobsPerTerm = context.settings.remoteokMaxJobsPerTerm;
    const parsedMaxJobsPerTerm = rawMaxJobsPerTerm
      ? Number.parseInt(rawMaxJobsPerTerm, 10)
      : defaultMaxJobsPerTerm;
    const maxJobsPerTerm = Number.isNaN(parsedMaxJobsPerTerm)
      ? defaultMaxJobsPerTerm
      : parsedMaxJobsPerTerm;

    // With no search terms, a single empty term matches every job.
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];

    let allJobs: RemoteOkJob[];
    try {
      allJobs = await fetchAll();
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: [], error: message };
    }

    // Dedup keys (source job id, else job URL) shared across all terms.
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    for (let i = 0; i < terms.length; i += 1) {
      if (context.shouldCancel?.()) break;
      const term = terms[i].trim();
      const normalizedTerm = term.toLowerCase();
      context.onProgress?.({
        phase: "list",
        termsProcessed: i,
        termsTotal: terms.length,
        currentUrl: term || "(all remote)",
        detail: `Remote OK: term ${i + 1}/${terms.length}`,
      });

      // Counts only jobs newly added for this term.
      let collected = 0;
      for (const job of allJobs) {
        if (collected >= maxJobsPerTerm) break;
        if (!tagMatchesTerm(job, normalizedTerm)) continue;
        const mapped = mapJob(job);
        if (!mapped) continue;
        const key = mapped.sourceJobId || mapped.jobUrl;
        if (seen.has(key)) continue;
        seen.add(key);
        out.push(mapped);
        collected += 1;
      }

      context.onProgress?.({
        phase: "list",
        termsProcessed: i + 1,
        termsTotal: terms.length,
        currentUrl: term || "(all remote)",
        jobPagesProcessed: out.length,
        detail: `Remote OK: completed term ${i + 1}/${terms.length} (${collected} matched)`,
      });
    }

    return { success: true, jobs: out };
  },
};
|
|
|
|
export default manifest;
|