Jobber/extractors/remoteok/manifest.ts
ilia 7b3dfb002a
Some checks failed
CI / Linting (Biome) (push) Failing after 36s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m6s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m9s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m5s
CI / Type Check (orchestrator) (push) Successful in 1m21s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m4s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m4s
CI / Documentation (push) Successful in 1m52s
feat(extractors): add 17 job source extractors and cross-source dedup
Adds extractor packages: arbeitnow, ashby, careerjet, fourdayweek,
greenhouse, himalayas, jobicy, jooble, lever, reed, remoteok, remotive,
themuse, usajobs, weworkremotely, workday — each with manifest, package
metadata and README.

Pipeline / shared:
- shared/job-fingerprint: stable hash for cross-source dedup, with tests
- discover-jobs: dedup via fingerprint and richer per-source merging
- jobs repository: fingerprint-aware upsert / lookup
- settings-registry, settings types/routes, demo-defaults: knobs for the
  new sources
- shared extractors index: register the new manifests
- location-support, profiles route: small fixes for the new sources

Tooling:
- scripts/smoke-extractors.ts to sanity-check each source locally
- scripts/jobber-cron-{cherepaha,dobkin}.env.example: per-host cron
  templates (CHANGEME placeholders only)
- .env.example: documented env vars for the new extractors
- .gitignore: ignore extractors/*/storage/ runtime caches (was ukvisajobs only)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-12 20:17:52 -04:00

191 lines
5.7 KiB
TypeScript

/**
* Remote OK public feed.
*
* https://remoteok.com/api — single JSON endpoint that returns the entire
* active remote-jobs board in one shot. The first array element is metadata
* (legal/attribution); jobs follow.
*
* No auth, no server-side pagination, no per-term query — we fetch once per
* pipeline run and apply each `searchTerm` as a case-insensitive filter over
* `position` + `tags` so the orchestrator's per-term iteration still works.
*
* Per Remote OK's TOS we send a descriptive User-Agent so they can identify
* traffic. The legal/attribution element is left out of the extracted jobs:
* `fetchAll` keeps only entries that look like job postings.
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
/** Endpoint of the Remote OK JSON feed; requested by `fetchAll`. */
const API_URL = "https://remoteok.com/api";
/** Value sent as the `User-Agent` request header so the traffic is identifiable. */
const USER_AGENT =
"Mozilla/5.0 (compatible; JobOps/1.0; +https://github.com/) job-search pipeline";
/**
* Shape of a single job entry in the Remote OK feed, as consumed by this
* extractor.
*
* Every field is optional: the response is cast rather than validated, so the
* consumers (`mapJob`, `tagMatchesTerm`) re-check values at runtime.
*/
interface RemoteOkJob {
/** Stringified into `sourceJobId` when present. */
id?: string | number;
/** Fallback for `sourceJobId` when `id` is null/undefined. */
slug?: string;
/** Job title; also searched when filtering by search term. */
position?: string;
/** Employer name. */
company?: string;
/** Preferred logo URL. */
company_logo?: string;
/** Fallback logo URL when `company_logo` is missing or blank. */
logo?: string;
/** Free-text location; `mapJob` substitutes "Remote" when missing or blank. */
location?: string;
/** Tags; searched when filtering by term and joined into `disciplines`. */
tags?: string[];
/** Job description, passed through as `jobDescription`. */
description?: string;
/** Listing URL; preferred value for `jobUrl`. */
url?: string;
/** Application URL; preferred `applicationLink` and fallback for `jobUrl`. */
apply_url?: string;
/** Posting date, passed through as `datePosted` without reformatting. */
date?: string;
/** Declared here but not read by the code in this module. */
epoch?: number;
/** Minimum salary as a raw number; 0 is treated as "not specified". */
salary_min?: number;
/** Maximum salary as a raw number; 0 is treated as "not specified". */
salary_max?: number;
}
/**
* Shape of the non-job metadata (legal/attribution) element that the feed
* carries alongside the job entries.
*/
interface RemoteOkLegalEntry {
legal?: string;
last_updated?: number;
}
/** Any element of the feed's top-level array: a job or the legal entry. */
type RemoteOkResponseEntry = RemoteOkJob | RemoteOkLegalEntry;
/**
 * Normalizes an arbitrary value into a non-empty, trimmed string.
 *
 * @param value - Anything; only strings are accepted.
 * @returns The trimmed string, or `undefined` when `value` is not a string or
 *   contains nothing but whitespace.
 */
function asString(value: unknown): string | undefined {
  if (typeof value === "string") {
    const cleaned = value.trim();
    if (cleaned.length > 0) return cleaned;
  }
  return undefined;
}
/**
 * Type guard that separates job postings from the feed's legal/attribution
 * element.
 *
 * An entry counts as a job when it carries at least one of the `id`,
 * `position`, `url` or `slug` keys.
 *
 * @param entry - One element of the feed's top-level array. The array comes
 *   from unvalidated JSON, so the value may not actually be an object.
 * @returns `true` when the entry looks like a job posting.
 */
function isJobEntry(entry: RemoteOkResponseEntry): entry is RemoteOkJob {
  // The `in` operator throws a TypeError on null and primitives; a single
  // malformed element must not abort the whole extraction.
  if (typeof entry !== "object" || entry === null) return false;
  return (
    "id" in entry || "position" in entry || "url" in entry || "slug" in entry
  );
}
/**
 * Decides whether a job matches a search term.
 *
 * The job's `position` and all of its `tags` are joined with single spaces,
 * lower-cased, and searched for the term as a substring.
 *
 * @param job - Feed entry to test.
 * @param normalizedTerm - Search term, already lower-cased by the caller. An
 *   empty term matches every job.
 * @returns `true` when the term is empty or occurs in the searchable text.
 */
function tagMatchesTerm(job: RemoteOkJob, normalizedTerm: string): boolean {
  if (normalizedTerm) {
    const parts: string[] = [job.position ?? ""];
    if (Array.isArray(job.tags)) {
      parts.push(...job.tags);
    }
    const searchable = parts.join(" ").toLowerCase();
    return searchable.includes(normalizedTerm);
  }
  return true;
}
/**
 * Converts one Remote OK feed entry into the pipeline's `CreateJobInput`.
 *
 * @param job - Feed entry; individual fields are re-validated here because
 *   the feed is not schema-checked.
 * @returns The mapped job, or `null` when the entry has neither a usable
 *   `url` nor a usable `apply_url`.
 */
function mapJob(job: RemoteOkJob): CreateJobInput | null {
  const applyUrl = asString(job.apply_url);
  const jobUrl = asString(job.url) ?? applyUrl;
  if (jobUrl === undefined) return null;

  // Remote OK reports salary as raw numbers; 0 means "not specified".
  const positiveAmount = (amount: number | undefined): number | undefined =>
    typeof amount === "number" && amount > 0 ? amount : undefined;
  const minSalary = positiveAmount(job.salary_min);
  const maxSalary = positiveAmount(job.salary_max);
  const hasSalary = minSalary !== undefined || maxSalary !== undefined;

  // Keep only string tags; anything else in the array is dropped.
  const tags: string[] = [];
  if (Array.isArray(job.tags)) {
    for (const tag of job.tags) {
      if (typeof tag === "string") tags.push(tag);
    }
  }

  // Prefer the id; fall back to the slug only when the id is absent.
  let sourceJobId: string | undefined;
  if (job.id != null) {
    sourceJobId = String(job.id);
  } else {
    sourceJobId = asString(job.slug);
  }

  return {
    source: "remoteok",
    sourceJobId,
    title: asString(job.position) ?? "Unknown Title",
    employer: asString(job.company) ?? "Unknown Employer",
    jobUrl,
    applicationLink: applyUrl ?? jobUrl,
    location: asString(job.location) ?? "Remote",
    isRemote: true,
    datePosted: asString(job.date),
    jobDescription: asString(job.description),
    companyLogo: asString(job.company_logo) ?? asString(job.logo),
    disciplines: tags.length > 0 ? tags.join(", ") : undefined,
    salaryMinAmount: minSalary,
    salaryMaxAmount: maxSalary,
    // Currency and interval are only reported alongside an actual amount.
    salaryCurrency: hasSalary ? "USD" : undefined,
    salaryInterval: hasSalary ? "yearly" : undefined,
  };
}
/**
 * Downloads the Remote OK feed and returns only the entries that look like
 * job postings.
 *
 * @returns The job entries of the feed, or an empty array when the response
 *   body is not an array.
 * @throws Error when the HTTP response status is not OK. Network and JSON
 *   parsing failures from `fetch` / `Response.json` propagate unchanged.
 */
async function fetchAll(): Promise<RemoteOkJob[]> {
  const requestHeaders = {
    Accept: "application/json",
    "User-Agent": USER_AGENT,
  };
  const res = await fetch(API_URL, { headers: requestHeaders });
  if (!res.ok) {
    throw new Error(`Remote OK request failed with status ${res.status}`);
  }

  const payload = (await res.json()) as RemoteOkResponseEntry[];
  // The cast is not a guarantee, so the array shape is still checked.
  return Array.isArray(payload) ? payload.filter(isJobEntry) : [];
}
/**
 * Extractor manifest for Remote OK.
 *
 * The feed is fetched once per run; each search term is then applied as a
 * local filter over the downloaded jobs.
 */
export const manifest: ExtractorManifest = {
  id: "remoteok",
  displayName: "Remote OK",
  providesSources: ["remoteok"],
  /**
   * Runs the extraction.
   *
   * @param context - Run context supplying search terms, settings, and the
   *   optional `shouldCancel` / `onProgress` callbacks.
   * @returns `success: false` with the error message when the feed cannot be
   *   fetched; otherwise `success: true` with the jobs collected so far
   *   (possibly partial if the run was cancelled between terms).
   */
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const defaultMaxJobsPerTerm = 100;
    const rawMaxJobsPerTerm = context.settings.remoteokMaxJobsPerTerm;
    const parsedMaxJobsPerTerm = rawMaxJobsPerTerm
      ? Number.parseInt(rawMaxJobsPerTerm, 10)
      : defaultMaxJobsPerTerm;
    // A non-numeric setting parses to NaN, and `collected >= NaN` is never
    // true, which would silently remove the cap. A negative value would
    // silently yield no jobs. Both fall back to the default instead.
    const maxJobsPerTerm =
      Number.isNaN(parsedMaxJobsPerTerm) || parsedMaxJobsPerTerm < 0
        ? defaultMaxJobsPerTerm
        : parsedMaxJobsPerTerm;

    // With no search terms, a single empty term matches every job.
    const terms = context.searchTerms.length > 0 ? context.searchTerms : [""];

    let allJobs: RemoteOkJob[];
    try {
      allJobs = await fetchAll();
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: [], error: message };
    }

    // Dedup across terms: a job matched by an earlier term is not emitted
    // again and does not count toward a later term's cap.
    const seen = new Set<string>();
    const out: CreateJobInput[] = [];

    for (let i = 0; i < terms.length; i += 1) {
      if (context.shouldCancel?.()) break;

      const term = terms[i].trim();
      const normalizedTerm = term.toLowerCase();

      context.onProgress?.({
        phase: "list",
        termsProcessed: i,
        termsTotal: terms.length,
        currentUrl: term || "(all remote)",
        detail: `Remote OK: term ${i + 1}/${terms.length}`,
      });

      let collected = 0;
      for (const job of allJobs) {
        if (collected >= maxJobsPerTerm) break;
        if (!tagMatchesTerm(job, normalizedTerm)) continue;

        const mapped = mapJob(job);
        if (!mapped) continue;

        const key = mapped.sourceJobId || mapped.jobUrl;
        if (seen.has(key)) continue;
        seen.add(key);

        out.push(mapped);
        collected += 1;
      }

      context.onProgress?.({
        phase: "list",
        termsProcessed: i + 1,
        termsTotal: terms.length,
        currentUrl: term || "(all remote)",
        jobPagesProcessed: out.length,
        detail: `Remote OK: completed term ${i + 1}/${terms.length} (${collected} matched)`,
      });
    }

    return { success: true, jobs: out };
  },
};

export default manifest;