ilia 7b3dfb002a
Some checks failed
CI / Linting (Biome) (push) Failing after 36s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m6s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m9s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m5s
CI / Type Check (orchestrator) (push) Successful in 1m21s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m4s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m4s
CI / Documentation (push) Successful in 1m52s
feat(extractors): add 17 job source extractors and cross-source dedup
Adds extractor packages: arbeitnow, ashby, careerjet, fourdayweek,
greenhouse, himalayas, jobicy, jooble, lever, reed, remoteok, remotive,
themuse, usajobs, weworkremotely, workday — each with manifest, package
metadata and README.

Pipeline / shared:
- shared/job-fingerprint: stable hash for cross-source dedup, with tests
- discover-jobs: dedup via fingerprint and richer per-source merging
- jobs repository: fingerprint-aware upsert / lookup
- settings-registry, settings types/routes, demo-defaults: knobs for the
  new sources
- shared extractors index: register the new manifests
- location-support, profiles route: small fixes for the new sources

Tooling:
- scripts/smoke-extractors.ts to sanity-check each source locally
- scripts/jobber-cron-{cherepaha,dobkin}.env.example: per-host cron
  templates (CHANGEME placeholders only)
- .env.example: documented env vars for the new extractors
- .gitignore: ignore extractors/*/storage/ runtime caches (was ukvisajobs only)

Co-authored-by: Cursor <cursoragent@cursor.com>
2026-05-12 20:17:52 -04:00

193 lines
5.4 KiB
TypeScript

/**
* We Work Remotely — public RSS feed.
*
* https://weworkremotely.com/remote-jobs.rss
*
* No auth. Returns all recent listings in a single XML feed.
* We filter client-side by matching title + skills + category
* against each pipeline search term.
*
* Title format from WWR: "Company Name: Job Title"
*/
import type {
ExtractorManifest,
ExtractorRunResult,
} from "@shared/types/extractors";
import type { CreateJobInput } from "@shared/types/jobs";
/** Public, unauthenticated RSS feed containing We Work Remotely's recent listings. */
const RSS_URL = "https://weworkremotely.com/remote-jobs.rss";
/**
 * One `<item>` of the We Work Remotely feed, as raw strings.
 *
 * Every field is optional because the corresponding element may be missing
 * or empty in the feed; values are not entity-decoded at this stage.
 */
interface WwrItem {
  /** Raw `<title>` text; WWR formats it as "Company Name: Job Title". */
  title?: string;
  /** Raw `<link>` text. */
  link?: string;
  /** Raw `<guid>` text. */
  guid?: string;
  /** Raw `<description>` text. */
  description?: string;
  /** Raw `<pubDate>` text, kept exactly as the feed provides it. */
  pubDate?: string;
  /** Raw `<region>` text. */
  region?: string;
  /** Raw `<country>` text. */
  country?: string;
  /** Raw `<skills>` text. */
  skills?: string;
  /** Raw `<category>` text. */
  category?: string;
  /** Raw `<type>` text. */
  type?: string;
  /** Value of the `url` attribute on the item's `media:content` element. */
  logoUrl?: string;
}
/**
 * Returns the trimmed text content of the first `<tag>…</tag>` element found
 * in `xml`, with any CDATA wrappers removed.
 *
 * This is a lightweight regex lookup, not a full XML parser: it does not
 * decode entities and does not understand nesting.
 *
 * @param xml - XML fragment to search.
 * @param tag - Element name to look for (used verbatim inside a regex).
 * @returns The element text, or `undefined` when the element is absent or
 *   its content is empty after trimming.
 */
function xmlText(xml: string, tag: string): string | undefined {
  // The tag name must be followed by whitespace (attributes) or `>` directly;
  // otherwise looking up "title" would also accept "<titles>" and capture
  // everything up to the next "</title>".
  const pattern = new RegExp(`<${tag}(?:\\s[^>]*)?>([\\s\\S]*?)</${tag}>`);
  const inner = pattern.exec(xml)?.[1];
  if (!inner) return undefined;
  // Unwrap <![CDATA[ … ]]> sections, keeping only their payload.
  const text = inner.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, "$1").trim();
  return text || undefined;
}
/**
 * Splits an RSS document into its `<item>` elements and extracts the fields
 * this extractor uses from each one.
 *
 * @param xml - Full RSS document text.
 * @returns One entry per `<item>…</item>` block, in document order.
 */
function parseItems(xml: string): WwrItem[] {
  const itemPattern = /<item>([\s\S]*?)<\/item>/g;
  return Array.from(xml.matchAll(itemPattern), (found): WwrItem => {
    // Capture group 1 is the item body without its enclosing tags.
    const inner = found[1] ?? "";
    const field = (tag: string): string | undefined => xmlText(inner, tag);
    // The logo lives in an attribute, so it needs its own pattern.
    const logo = /media:content\s+url="([^"]+)"/.exec(inner);
    return {
      title: field("title"),
      link: field("link"),
      guid: field("guid"),
      description: field("description"),
      pubDate: field("pubDate"),
      region: field("region"),
      country: field("country"),
      skills: field("skills"),
      category: field("category"),
      type: field("type"),
      logoUrl: logo?.[1],
    };
  });
}
/**
 * Splits a WWR feed title of the form "Company Name: Job Title" into its
 * employer and job-title parts.
 *
 * The split happens at the first ": " only, so later colons stay in the job
 * title. When there is no separator, or it sits at the very start of the
 * string, the whole input is treated as the title.
 *
 * @param raw - Title text as taken from the feed.
 * @returns The employer ("Unknown Employer" when it cannot be determined)
 *   and the trimmed job title.
 */
function parseTitle(raw: string): { employer: string; title: string } {
  const separator = ": ";
  const splitAt = raw.indexOf(separator);

  // -1 means "not found"; 0 would leave an empty employer name.
  if (splitAt <= 0) {
    return { employer: "Unknown Employer", title: raw.trim() };
  }

  const employer = raw.slice(0, splitAt).trim();
  const title = raw.slice(splitAt + separator.length).trim();
  return { employer, title };
}
/**
 * Reports whether a search term occurs, case-insensitively, in the item's
 * title, skills, or category. Other fields (e.g. the description) are not
 * searched.
 *
 * @param item - Parsed feed item.
 * @param term - Search term; matched as a plain substring.
 * @returns `true` when at least one of the three fields contains the term.
 */
function matchesTerm(item: WwrItem, term: string): boolean {
  const needle = term.toLowerCase();
  const searchable = [item.title, item.skills, item.category];
  return searchable.some(
    (field) => field?.toLowerCase().includes(needle) ?? false,
  );
}
/**
 * Decodes one level of XML/HTML escaping: the five predefined entities
 * (`&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;`) and numeric character
 * references (`&#39;`, `&#x27;`).
 *
 * Decoding happens in a single pass so that already-decoded output is never
 * re-scanned; e.g. `&amp;lt;` becomes the literal text `&lt;`, not `<`.
 * Unrecognised entities and out-of-range code points are left untouched.
 *
 * @param html - Escaped text.
 * @returns The text with recognised entities replaced by their characters.
 */
function decodeHtmlEntities(html: string): string {
  const named: Record<string, string> = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
  };
  return html.replace(
    /&(#[xX][0-9a-fA-F]+|#[0-9]+|amp|lt|gt|quot|apos);/g,
    (whole: string, body: string): string => {
      if (body.startsWith("#")) {
        const isHex = /^#x/i.test(body);
        const code = Number.parseInt(body.slice(isHex ? 2 : 1), isHex ? 16 : 10);
        // String.fromCodePoint throws on values outside the Unicode range.
        return code > 0 && code <= 0x10ffff ? String.fromCodePoint(code) : whole;
      }
      return named[body] ?? whole;
    },
  );
}
/**
 * Converts a parsed feed item into the pipeline's job input shape.
 *
 * All text taken from the feed is entity-decoded before use, including the
 * URLs: values inside XML are escaped, so for example a query string arrives
 * as `?a=1&amp;b=2`. `sourceJobId` is intentionally kept as the raw feed
 * value so that identifiers stay stable across runs.
 *
 * @param item - Parsed feed item.
 * @returns The mapped job, or `null` when the item has neither a link nor a
 *   guid to use as its URL.
 */
function mapJob(item: WwrItem): CreateJobInput | null {
  const rawUrl = item.link || item.guid;
  if (!rawUrl) return null;

  // Decode an optional field, collapsing missing/empty values to undefined.
  const decode = (value: string | undefined): string | undefined =>
    value ? decodeHtmlEntities(value) : undefined;

  const jobUrl = decodeHtmlEntities(rawUrl);
  const rawTitle = item.title
    ? decodeHtmlEntities(item.title)
    : "Unknown Title";
  const { employer, title } = parseTitle(rawTitle);
  const location =
    [decode(item.region), decode(item.country)].filter(Boolean).join(" — ") ||
    "Remote";

  return {
    source: "weworkremotely",
    sourceJobId: item.guid ?? item.link,
    title,
    employer,
    jobUrl,
    applicationLink: jobUrl,
    location,
    isRemote: true,
    jobType: decode(item.type),
    companyLogo: decode(item.logoUrl),
    datePosted: item.pubDate,
    jobDescription: decode(item.description),
    disciplines: decode(item.skills),
    companyIndustry: decode(item.category),
  };
}
/**
 * Extractor manifest for We Work Remotely.
 *
 * The feed is downloaded once per run and filtered locally: an item is kept
 * when it matches at least one search term (or unconditionally when no
 * search terms are given).
 */
export const manifest: ExtractorManifest = {
  id: "weworkremotely",
  displayName: "We Work Remotely",
  providesSources: ["weworkremotely"],

  /**
   * Fetches the RSS feed, filters it by the pipeline's search terms, and
   * returns the de-duplicated jobs.
   *
   * The result is capped at `weworkremotelyMaxJobsPerTerm` (default 100)
   * multiplied by the number of search terms; the cap applies to the combined
   * result, not to each term individually.
   *
   * @returns `success: true` with the matched jobs, or `success: false` with
   *   an error message when the download or parsing throws. A cancelled run
   *   returns `success: true` with no jobs.
   */
  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const defaultMaxJobs = 100;
    const rawMaxJobs = context.settings.weworkremotelyMaxJobsPerTerm;
    const parsedMaxJobs = rawMaxJobs
      ? Number.parseInt(rawMaxJobs, 10)
      : defaultMaxJobs;
    // A non-numeric setting parses to NaN, and `length >= NaN` is always
    // false, which would silently remove the cap; use the default instead.
    const maxJobs = Number.isNaN(parsedMaxJobs) ? defaultMaxJobs : parsedMaxJobs;

    // Blank terms would match every item and stray whitespace would match
    // none, so normalise before filtering.
    const terms = context.searchTerms
      .map((term: string) => term.trim())
      .filter((term: string) => term.length > 0);
    const maxTotal = maxJobs * Math.max(terms.length, 1);

    context.onProgress?.({
      phase: "list",
      termsProcessed: 0,
      termsTotal: 1,
      currentUrl: RSS_URL,
      detail: "We Work Remotely: fetching RSS feed",
    });

    try {
      const response = await fetch(RSS_URL, {
        headers: { Accept: "application/rss+xml, application/xml, text/xml" },
      });
      if (!response.ok) {
        throw new Error(`WWR RSS failed with status ${response.status}`);
      }
      const xml = await response.text();

      // The download may take a while; honour a cancellation requested
      // in the meantime, mirroring the check at the start of the run.
      if (context.shouldCancel?.()) return { success: true, jobs: [] };

      const items = parseItems(xml);
      const seen = new Set<string>();
      const out: CreateJobInput[] = [];

      for (const item of items) {
        if (out.length >= maxTotal) break;

        const wanted =
          terms.length === 0 ||
          terms.some((term: string) => matchesTerm(item, term));
        if (!wanted) continue;

        const mapped = mapJob(item);
        if (!mapped) continue;

        // The feed can list the same posting more than once.
        const key = mapped.sourceJobId || mapped.jobUrl;
        if (seen.has(key)) continue;
        seen.add(key);
        out.push(mapped);
      }

      context.onProgress?.({
        phase: "list",
        termsProcessed: 1,
        termsTotal: 1,
        currentUrl: RSS_URL,
        jobPagesProcessed: out.length,
        detail: `We Work Remotely: ${out.length} matched from ${items.length} total`,
      });
      return { success: true, jobs: out };
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: [], error: message };
    }
  },
};
export default manifest;