Some checks failed
CI / Linting (Biome) (push) Failing after 36s
CI / Tests (push) Successful in 5m54s
CI / Type Check (adzuna-extractor) (push) Successful in 1m6s
CI / Type Check (gradcracker-extractor) (push) Successful in 1m9s
CI / Type Check (hiringcafe-extractor) (push) Successful in 1m5s
CI / Type Check (orchestrator) (push) Successful in 1m21s
CI / Type Check (startupjobs-extractor) (push) Successful in 1m4s
CI / Type Check (ukvisajobs-extractor) (push) Successful in 1m4s
CI / Documentation (push) Successful in 1m52s
Adds extractor packages: arbeitnow, ashby, careerjet, fourdayweek,
greenhouse, himalayas, jobicy, jooble, lever, reed, remoteok, remotive,
themuse, usajobs, weworkremotely, workday — each with manifest, package
metadata and README.
Pipeline / shared:
- shared/job-fingerprint: stable hash for cross-source dedup, with tests
- discover-jobs: dedup via fingerprint and richer per-source merging
- jobs repository: fingerprint-aware upsert / lookup
- settings-registry, settings types/routes, demo-defaults: knobs for the
new sources
- shared extractors index: register the new manifests
- location-support, profiles route: small fixes for the new sources
Tooling:
- scripts/smoke-extractors.ts to sanity-check each source locally
- scripts/jobber-cron-{cherepaha,dobkin}.env.example: per-host cron
templates (CHANGEME placeholders only)
- .env.example: documented env vars for the new extractors
- .gitignore: ignore extractors/*/storage/ runtime caches (was ukvisajobs only)
Co-authored-by: Cursor <cursoragent@cursor.com>
193 lines
5.4 KiB
TypeScript
193 lines
5.4 KiB
TypeScript
/**
 * We Work Remotely — public RSS feed.
 *
 * https://weworkremotely.com/remote-jobs.rss
 *
 * No auth. Returns all recent listings in a single XML feed.
 * We filter client-side by matching title + skills + category
 * against each pipeline search term.
 *
 * Title format from WWR: "Company Name: Job Title"
 */
|
|
|
|
import type {
|
|
ExtractorManifest,
|
|
ExtractorRunResult,
|
|
} from "@shared/types/extractors";
|
|
import type { CreateJobInput } from "@shared/types/jobs";
|
|
|
|
/** URL of the We Work Remotely RSS feed that this extractor fetches. */
const RSS_URL = "https://weworkremotely.com/remote-jobs.rss";
|
|
|
|
/**
 * One `<item>` of the RSS feed, reduced to the child elements this extractor
 * reads. Every field is optional because an element may be absent or empty.
 */
interface WwrItem {
  /** Text of `<title>`. */
  title?: string;
  /** Text of `<link>`. */
  link?: string;
  /** Text of `<guid>`. */
  guid?: string;
  /** Text of `<description>`. */
  description?: string;
  /** Text of `<pubDate>`, kept as the raw feed string. */
  pubDate?: string;
  /** Text of `<region>`. */
  region?: string;
  /** Text of `<country>`. */
  country?: string;
  /** Text of `<skills>`. */
  skills?: string;
  /** Text of `<category>`. */
  category?: string;
  /** Text of `<type>`. */
  type?: string;
  /** Value of the `url` attribute on `media:content`. */
  logoUrl?: string;
}
|
|
|
|
/**
 * Returns the trimmed text content of the first `<tag>…</tag>` element in `xml`.
 *
 * CDATA wrappers inside the element are unwrapped. Entities are NOT decoded.
 *
 * @param xml - XML fragment to search.
 * @param tag - Element name to look for; matched literally.
 * @returns The element text, or `undefined` when the element is missing or
 *   its content is empty after trimming.
 */
function xmlText(xml: string, tag: string): string | undefined {
  // Escape regex metacharacters so the tag name is always matched literally.
  const name = tag.replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
  // Requiring whitespace (attributes) or `>` right after the name keeps a
  // lookup for `type` from opening on a longer name such as `<types>`.
  const pattern = new RegExp(`<${name}(?:\\s[^>]*)?>([\\s\\S]*?)</${name}>`);
  const inner = pattern.exec(xml)?.[1];
  if (!inner) return undefined;
  const text = inner.replace(/<!\[CDATA\[([\s\S]*?)\]\]>/g, "$1").trim();
  return text || undefined;
}
|
|
|
|
/**
 * Splits the feed XML into its `<item>` elements and reads the fields of
 * interest from each one.
 *
 * Only bare `<item>` opening tags (no attributes) are recognised. Fields whose
 * element is missing or empty come back as `undefined`.
 *
 * @param xml - Full RSS document text.
 * @returns One `WwrItem` per `<item>`, in document order.
 */
function parseItems(xml: string): WwrItem[] {
  const itemPattern = /<item>([\s\S]*?)<\/item>/g;

  return Array.from(xml.matchAll(itemPattern), (found): WwrItem => {
    // Capture group 1 is the item content without the surrounding tags.
    const body = found[1] ?? "";
    const read = (tag: string): string | undefined => xmlText(body, tag);
    const logo = /media:content\s+url="([^"]+)"/.exec(body);

    return {
      title: read("title"),
      link: read("link"),
      guid: read("guid"),
      description: read("description"),
      pubDate: read("pubDate"),
      region: read("region"),
      country: read("country"),
      skills: read("skills"),
      category: read("category"),
      type: read("type"),
      logoUrl: logo?.[1],
    };
  });
}
|
|
|
|
/**
 * Splits a feed title of the form `"Company Name: Job Title"` at the first
 * `": "` separator.
 *
 * @param raw - Title text as it appears in the feed.
 * @returns The trimmed employer and job title. When there is no separator, or
 *   it sits at the very start, the employer is `"Unknown Employer"` and the
 *   whole trimmed input is the title.
 */
function parseTitle(raw: string): { employer: string; title: string } {
  const separator = ": ";
  const at = raw.indexOf(separator);

  if (at <= 0) {
    return { employer: "Unknown Employer", title: raw.trim() };
  }

  const employer = raw.slice(0, at).trim();
  const title = raw.slice(at + separator.length).trim();
  return { employer, title };
}
|
|
|
|
/**
 * Case-insensitive substring test of a search term against an item's title,
 * skills and category.
 *
 * @param item - Parsed feed item.
 * @param term - Search term; compared in lower case.
 * @returns `true` when any of the three fields is present and contains the term.
 */
function matchesTerm(item: WwrItem, term: string): boolean {
  const needle = term.toLowerCase();
  const candidates = [item.title, item.skills, item.category];
  return candidates.some(
    (field) => field?.toLowerCase().includes(needle) === true,
  );
}
|
|
|
|
/**
 * Decodes the basic HTML/XML character entities in feed text:
 * `&amp;`, `&lt;`, `&gt;`, `&quot;`, `&apos;` and `&#39;`.
 *
 * All entities are replaced in a single pass, so text that was escaped twice
 * (for example `&amp;lt;`) is decoded exactly one level rather than collapsing
 * straight to `<`. Any other entity is left untouched.
 *
 * @param html - Text that may contain escaped characters.
 * @returns The text with the entities above replaced by their characters.
 */
function decodeHtmlEntities(html: string): string {
  // Keyed by entity name (without `&` and `;`).
  const replacements: Record<string, string> = {
    amp: "&",
    lt: "<",
    gt: ">",
    quot: '"',
    apos: "'",
    "#39": "'",
  };
  return html.replace(
    /&(amp|lt|gt|quot|apos|#39);/g,
    (entity: string, name: string) => replacements[name] ?? entity,
  );
}
|
|
|
|
/**
 * Converts a parsed feed item into the pipeline's job input shape.
 *
 * The job URL is the item's link, falling back to its guid; items with
 * neither are dropped. The feed title is entity-decoded and split into
 * employer and job title. Location is region and country joined with " — ",
 * or "Remote" when both are missing. Every job is flagged as remote.
 *
 * @param item - Parsed feed item.
 * @returns The mapped job, or `null` when the item has no usable URL.
 */
function mapJob(item: WwrItem): CreateJobInput | null {
  const { link, guid, region, country, description } = item;

  const jobUrl = link || guid;
  if (!jobUrl) return null;

  const decodedTitle = item.title
    ? decodeHtmlEntities(item.title)
    : "Unknown Title";
  const { employer, title } = parseTitle(decodedTitle);

  const locationParts = [region, country].filter(Boolean);
  const location = locationParts.join(" — ") || "Remote";

  let jobDescription: string | undefined;
  if (description) {
    jobDescription = decodeHtmlEntities(description);
  }

  return {
    source: "weworkremotely",
    sourceJobId: guid ?? link,
    title,
    employer,
    jobUrl,
    applicationLink: jobUrl,
    location,
    isRemote: true,
    jobType: item.type || undefined,
    companyLogo: item.logoUrl,
    datePosted: item.pubDate,
    jobDescription,
    disciplines: item.skills || undefined,
    companyIndustry: item.category || undefined,
  };
}
|
|
|
|
/**
 * Extractor manifest for We Work Remotely.
 *
 * `run` downloads the RSS feed once, keeps the items that match at least one
 * search term (or every item when no terms are given), maps them to job
 * inputs, and drops duplicates by `sourceJobId` / `jobUrl`. Collection stops
 * once `maxJobs × max(number of terms, 1)` jobs have been gathered, where
 * `maxJobs` comes from the `weworkremotelyMaxJobsPerTerm` setting (default 100).
 *
 * Progress is reported through `context.onProgress` before the fetch and after
 * filtering. Failures (HTTP error status, network or parsing exceptions) are
 * returned as `{ success: false, jobs: [], error }` rather than thrown.
 */
export const manifest: ExtractorManifest = {
  id: "weworkremotely",
  displayName: "We Work Remotely",
  providesSources: ["weworkremotely"],

  async run(context): Promise<ExtractorRunResult> {
    if (context.shouldCancel?.()) return { success: true, jobs: [] };

    const defaultMaxJobs = 100;
    const rawMaxJobs = context.settings.weworkremotelyMaxJobsPerTerm;
    const parsedMaxJobs = rawMaxJobs
      ? Number.parseInt(rawMaxJobs, 10)
      : defaultMaxJobs;
    // parseInt returns NaN for non-numeric input, and `length >= NaN` is never
    // true, so an invalid setting would silently remove the cap. Use the
    // default instead.
    const maxJobs = Number.isNaN(parsedMaxJobs) ? defaultMaxJobs : parsedMaxJobs;

    const terms = context.searchTerms;
    // The feed is fetched once for all terms, so the per-term cap is scaled by
    // the number of terms to get one overall limit.
    const limit = maxJobs * Math.max(terms.length, 1);

    context.onProgress?.({
      phase: "list",
      termsProcessed: 0,
      termsTotal: 1,
      currentUrl: RSS_URL,
      detail: "We Work Remotely: fetching RSS feed",
    });

    try {
      const response = await fetch(RSS_URL, {
        headers: { Accept: "application/rss+xml, application/xml, text/xml" },
      });
      if (!response.ok) {
        throw new Error(`WWR RSS failed with status ${response.status}`);
      }
      const xml = await response.text();
      const items = parseItems(xml);

      const seen = new Set<string>();
      const out: CreateJobInput[] = [];

      for (const item of items) {
        if (out.length >= limit) break;
        // With no search terms every item is accepted.
        if (terms.length > 0 && !terms.some((t) => matchesTerm(item, t))) {
          continue;
        }
        const mapped = mapJob(item);
        if (!mapped) continue;
        const key = mapped.sourceJobId || mapped.jobUrl;
        if (seen.has(key)) continue;
        seen.add(key);
        out.push(mapped);
      }

      context.onProgress?.({
        phase: "list",
        termsProcessed: 1,
        termsTotal: 1,
        currentUrl: RSS_URL,
        jobPagesProcessed: out.length,
        detail: `We Work Remotely: ${out.length} matched from ${items.length} total`,
      });

      return { success: true, jobs: out };
    } catch (error) {
      const message = error instanceof Error ? error.message : "Unknown error";
      return { success: false, jobs: [], error: message };
    }
  },
};

export default manifest;
|