Reject hybrid or partial-office postings at ingest so the Remote badge and filters match fully remote roles. Cron can PATCH search geography, remote-only workplace types, and QA search terms before each scheduled pipeline run. Co-authored-by: Cursor <cursoragent@cursor.com>
295 lines
11 KiB
Bash
Executable File
295 lines
11 KiB
Bash
Executable File
#!/usr/bin/env bash
# Trigger a Jobber pipeline run, wait for it to complete, then post a summary plus
# job links to Telegram.
# Secrets: copy scripts/jobber-cron.env.example to /root/.jobber-cron.env (chmod 600).
set -euo pipefail

# Env file holding the secrets; JOBBER_CRON_ENV overrides the default location.
ENV_FILE="${JOBBER_CRON_ENV:-/root/.jobber-cron.env}"
[[ -f "$ENV_FILE" ]] || {
  echo "Missing env file: $ENV_FILE (set JOBBER_CRON_ENV or create the default path)" >&2
  exit 1
}
# shellcheck source=/dev/null
. "$ENV_FILE"

# Both Telegram settings are mandatory: abort with a hint if either is unset or empty.
: "${TELEGRAM_BOT_TOKEN:?Set TELEGRAM_BOT_TOKEN in $ENV_FILE}"
: "${TELEGRAM_CHAT_ID:?Set TELEGRAM_CHAT_ID in $ENV_FILE}"

# Base URL of the app API and the cap on job links per Telegram message.
BASE="${JOBOPS_URL:-http://127.0.0.1:3005}"
MAX_JOBS="${JOB_TELEGRAM_MAX_JOBS:-25}"

# Extra curl arguments for HTTP basic auth; stays empty unless both the user and the
# password are configured.
if [[ -z "${BASIC_AUTH_USER:-}" || -z "${BASIC_AUTH_PASSWORD:-}" ]]; then
  AUTH=()
else
  AUTH=(-u "${BASIC_AUTH_USER}:${BASIC_AUTH_PASSWORD}")
fi
|
|
|
|
# Escape text for use as element content in a Telegram HTML message.
# Arguments: $1 - raw text
# Outputs:   $1 on stdout with & < > replaced by &amp; &lt; &gt;
tg_html_escape() {
  # "&" is handled first so the entities emitted for "<" and ">" are not escaped again.
  # In a sed replacement a bare "&" means "the matched text", hence the "\&".
  printf '%s' "$1" | sed -e 's/&/\&amp;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}
|
|
|
|
# Escape a URL for use inside a double-quoted href="..." attribute of a Telegram
# HTML message.
# Arguments: $1 - raw URL
# Outputs:   $1 on stdout with & and " replaced by &amp; and &quot;
tg_href_escape() {
  # "&" first, so the "&" of the emitted &quot; is not escaped a second time.
  # In a sed replacement a bare "&" means "the matched text", hence the "\&".
  printf '%s' "$1" | sed -e 's/&/\&amp;/g' -e 's/"/\&quot;/g'
}
|
|
|
|
# Send one HTML-formatted message to the configured Telegram chat.
# Globals:   TELEGRAM_BOT_TOKEN, TELEGRAM_CHAT_ID (read)
# Arguments: $1 - message text, already valid for parse_mode "HTML"
# Outputs:   nothing on stdout (the API response is discarded); curl errors go to stderr
# Returns:   curl's exit status
send_tg_html() {
  local msg="$1"
  # jq builds the JSON body so the chat id and text are always correctly quoted.
  curl --silent --show-error --request POST \
    --header "Content-Type: application/json" \
    --data "$(jq -n --arg c "$TELEGRAM_CHAT_ID" --arg t "$msg" \
      '{chat_id: $c, text: $t, parse_mode: "HTML", disable_web_page_preview: true}')" \
    "https://api.telegram.org/bot${TELEGRAM_BOT_TOKEN}/sendMessage" >/dev/null
}
|
|
|
|
# GET ${BASE}/api/pipeline/status, asking for JSON.
# Globals: AUTH, BASE (read)
# Outputs: the response body on stdout
# Returns: curl's exit status
fetch_status() {
  curl --silent --show-error --compressed "${AUTH[@]}" \
    --header "Accept: application/json" \
    "${BASE}/api/pipeline/status"
}
|
|
|
|
apply_cron_settings() {
|
|
local patch='{}'
|
|
if [[ -n "${JOBBER_CRON_SEARCH_CITIES:-}" ]]; then
|
|
patch="$(echo "$patch" | jq --arg v "$JOBBER_CRON_SEARCH_CITIES" '. + {searchCities: $v}')"
|
|
fi
|
|
if [[ -n "${JOBBER_CRON_JOBSPY_COUNTRY:-}" ]]; then
|
|
patch="$(echo "$patch" | jq --arg v "$JOBBER_CRON_JOBSPY_COUNTRY" '. + {jobspyCountryIndeed: $v}')"
|
|
fi
|
|
if [[ -n "${JOBBER_CRON_WORKPLACE_TYPES:-}" ]]; then
|
|
patch="$(echo "$patch" | jq --arg v "$JOBBER_CRON_WORKPLACE_TYPES" \
|
|
'. + {workplaceTypes: ($v | split(",") | map(gsub("^\\s+|\\s+$";"")) | map(select(. != "")))}')"
|
|
fi
|
|
if [[ -n "${JOBBER_CRON_SEARCH_TERMS:-}" ]]; then
|
|
patch="$(echo "$patch" | jq --arg v "$JOBBER_CRON_SEARCH_TERMS" \
|
|
'. + {searchTerms: ($v | split("|") | map(gsub("^\\s+|\\s+$";"")) | map(select(. != "")))}')"
|
|
fi
|
|
if [[ "$patch" == "{}" ]]; then
|
|
return 0
|
|
fi
|
|
local resp
|
|
resp="$(curl -sS --compressed "${AUTH[@]}" -X PATCH "${BASE}/api/settings" \
|
|
-H "Accept: application/json" -H "Content-Type: application/json" \
|
|
-d "$patch")"
|
|
if ! echo "$resp" | jq -e '.ok == true' >/dev/null 2>&1; then
|
|
send_tg_html "Jobber: PATCH /api/settings failed before cron run: $(tg_html_escape "$(echo "$resp" | jq -c . 2>/dev/null || echo "$resp")")"
|
|
exit 1
|
|
fi
|
|
}
|
|
|
|
# GET ${BASE}/api/jobs?view=list, asking for JSON.
# Globals: AUTH, BASE (read)
# Outputs: the response body on stdout
# Returns: curl's exit status
fetch_jobs_list() {
  curl --silent --show-error --compressed "${AUTH[@]}" \
    --header "Accept: application/json" \
    "${BASE}/api/jobs?view=list"
}
|
|
|
|
# GET ${BASE}/api/jobs/revision, asking for JSON.
# Globals: AUTH, BASE (read)
# Outputs: the response body on stdout
# Returns: curl's exit status
fetch_jobs_revision() {
  curl --silent --show-error --compressed "${AUTH[@]}" \
    --header "Accept: application/json" \
    "${BASE}/api/jobs/revision"
}
|
|
|
|
# The jobs list can briefly lag behind a finished run, and proxies can be flaky, so
# poll it: up to 25 attempts, pausing 2 seconds after every attempt that is rejected.
# Arguments: $1 - number of jobs the run reported as discovered
# Outputs:   the first accepted response on stdout, or the last response received
#            once all attempts are used up
fetch_jobs_list_when_ready() {
  local expected_discovered="$1"
  local resp=""
  local n=0
  local attempt
  for ((attempt = 0; attempt < 25; attempt++)); do
    resp="$(fetch_jobs_list)"
    if jq -e '.ok == true' <<<"$resp" >/dev/null 2>&1; then
      n="$(jq -r '((.data // {}) | .jobs // []) | length' <<<"$resp")"
      # An empty list is only acceptable when the run discovered nothing.
      if [[ "$expected_discovered" -eq 0 ]] || [[ "$n" -gt 0 ]]; then
        echo "$resp"
        return 0
      fi
    fi
    sleep 2
  done
  # Out of attempts: hand back whatever came last and let the caller judge it.
  echo "$resp"
}
|
|
|
|
#######################################
# Select the jobs to list in the Telegram summary.
# Arguments:
#   $1 - JSON response of the jobs list endpoint (jobs are read from .data.jobs)
#   $2 - run start timestamp ("" when unknown)
#   $3 - run completion timestamp ("" when unknown)
#   $4 - maximum number of jobs to include in .lines (must be a JSON number)
# Outputs: one compact JSON object on stdout:
#   {total, usedFallback, lines: [{title, url, employer}, ...]}
#   total        - number of selected jobs before truncation to $4
#   usedFallback - true when no job matched the run window and all jobs are listed
#   lines        - newest first, at most $4 entries; url is jobUrl, else
#                  applicationLink, else ""
# Returns: jq's exit status.
#
# Selection order: jobs with discoveredAt inside [start, end]; if none (or end is
# unknown), jobs with discoveredAt >= start; if none (or start is unknown), all jobs.
#######################################
build_job_lines_html() {
  local jobs_json="$1"
  local started="$2"
  local completed="$3"
  local max_n="$4"

  # Pipeline run times are ISO-8601 (...T...Z). Jobs often use SQLite datetime('now'):
  # "YYYY-MM-DD HH:MM:SS". A raw string compare sorts the space before the clock ahead
  # of "T", so every SQLite-style discoveredAt would fall *before* the run window.
  # normalizeTs rewrites that space to "T" for comparison only.
  #
  # The slice bound must be the $max variable passed with --argjson. A bare `max` is
  # the jq builtin (largest element of the input array, here a job object), which
  # makes the slice fail for every non-empty list.
  echo "$jobs_json" | jq -c --arg s "$started" --arg e "$completed" --argjson max "$max_n" '
    def pickurl:
      if (.jobUrl // "") != "" then .jobUrl
      elif (.applicationLink // "") != "" then .applicationLink
      else "" end;
    def normalizeTs:
      if . == null or . == "" then ""
      elif test("[Tt]") then .
      else sub(" "; "T")
      end;
    def pickrows($all):
      if ($s == "" or $s == null) then
        { rows: ($all | sort_by(.discoveredAt | normalizeTs) | reverse), usedFallback: true }
      else
        ($all
          | map(select(
              ($e != "" and ((.discoveredAt | normalizeTs) >= ($s | normalizeTs)) and ((.discoveredAt | normalizeTs) <= ($e | normalizeTs)))
            ))) as $win |
        if ($win | length) > 0 then { rows: $win, usedFallback: false }
        else
          ($all | map(select(((.discoveredAt | normalizeTs) >= ($s | normalizeTs))))) as $from |
          if ($from | length) > 0 then { rows: $from, usedFallback: false }
          else
            { rows: ($all | sort_by(.discoveredAt | normalizeTs) | reverse), usedFallback: true }
          end
        end
      end;
    (((.data // {}) | .jobs) // []) as $all |
    if ($all | length) == 0 then
      {total: 0, lines: [], usedFallback: false}
    else
      pickrows($all) as $picked |
      ($picked.rows | sort_by(.discoveredAt | normalizeTs) | reverse) as $sorted |
      ($sorted | length) as $total |
      ($sorted | .[0:$max]) as $slice |
      {
        total: $total,
        usedFallback: $picked.usedFallback,
        lines: [
          $slice[] |
          {
            title: (.title // "Untitled"),
            url: pickurl,
            employer: (.employer // "")
          }
        ]
      }
    end
  '
}
|
|
|
|
# Append one Telegram HTML line per entry of .lines to a caller-owned variable.
# Arguments:
#   $1 - JSON object carrying a .lines array of {title, url, employer}
#   $2 - NAME of the variable to append to (bound through a nameref)
# Each line is "employer — title", preceded by a newline. The title is wrapped in a
# link when the entry has a URL and tagged "(no URL)" otherwise.
append_lines_from_json() {
  local sel="$1"
  local -n _out="$2"
  # Local names are kept as they are: a nameref target sharing a name with one of
  # these locals would be shadowed by it.
  local item line title url emp
  while IFS= read -r item; do
    if [[ -z "$item" || "$item" == "null" ]]; then
      continue
    fi
    title="$(jq -r '.title // "Untitled"' <<<"$item")"
    url="$(jq -r '.url // ""' <<<"$item")"
    emp="$(jq -r '.employer // ""' <<<"$item")"
    if [[ -z "$url" || "$url" == "null" ]]; then
      line="$(tg_html_escape "$emp") — $(tg_html_escape "$title") <i>(no URL)</i>"
    else
      line="$(tg_html_escape "$emp") — <a href=\"$(tg_href_escape "$url")\">$(tg_html_escape "$title")</a>"
    fi
    _out+=$'\n'"${line}"
  done < <(jq -c '.lines[]? // empty' <<<"$sel")
}
|
|
|
|
# ---- Main flow ---------------------------------------------------------------
# 1. Bail out when the status endpoint is unhealthy or a run is already active.
# 2. Apply cron settings and start a run.
# 3. Poll the status every 30 s (720 polls = 6 h) until the run has finished.
# 4. Send a Telegram summary with links to the jobs found.
#
# The `|| true` after each fetch keeps `set -e` from killing the script on a curl
# transport error (e.g. connection refused) before the matching alert is sent; the
# `.ok` check that follows treats an empty or partial body as a failure.
body="$(fetch_status)" || true
if ! echo "$body" | jq -e '.ok == true' >/dev/null 2>&1; then
  send_tg_html "Jobber: /api/pipeline/status failed (before run). Check container."
  exit 1
fi

if echo "$body" | jq -e '.data.isRunning == true' >/dev/null 2>&1; then
  send_tg_html "Jobber: pipeline already running; skipping scheduled run."
  exit 0
fi

# Start time of the run that preceded this one ("" when there is none). Used below to
# recognise a run that finished before the first poll could observe it running.
prev_started="$(echo "$body" | jq -r '(.data.lastRun.startedAt)? // ""')"

apply_cron_settings

# Optional: comma-separated sources (see JOBBER_PIPELINE_SOURCES in jobber-cron.env.example).
# If unset, POST body is {} and the server uses its default source list.
run_body='{}'
if [[ -n "${JOBBER_PIPELINE_SOURCES:-}" ]]; then
  run_body="$(jq -n --arg s "$JOBBER_PIPELINE_SOURCES" \
    '$s | split(",") | map(gsub("^\\s+|\\s+$";"")) | map(select(. != "")) | {sources: .}')"
fi

resp="$(curl -sS --compressed "${AUTH[@]}" -X POST "${BASE}/api/pipeline/run" \
  -H "Accept: application/json" -H "Content-Type: application/json" -d "$run_body")" || true
if ! echo "$resp" | jq -e '.ok == true' >/dev/null 2>&1; then
  _fail_json="$(echo "$resp" | jq -c . 2>/dev/null || echo "$resp")"
  send_tg_html "Jobber: POST /api/pipeline/run failed: $(tg_html_escape "$_fail_json")"
  exit 1
fi

was_running=0
for ((poll = 1; poll <= 720; poll++)); do
  sleep 30
  body="$(fetch_status)" || true
  if ! echo "$body" | jq -e '.ok == true' >/dev/null 2>&1; then
    send_tg_html "Jobber: status check failed mid-run."
    exit 1
  fi
  running="$(echo "$body" | jq -r '.data.isRunning')"
  if [[ "$running" == "true" ]]; then
    was_running=1
    continue
  fi

  lr="$(echo "$body" | jq '.data.lastRun')"
  started="$(echo "$lr" | jq -r '.startedAt? // ""')"
  if [[ "$was_running" -ne 1 ]]; then
    # Never seen running. A run shorter than one poll interval would otherwise go
    # unnoticed and end in the 6 h timeout, so also accept "not running and lastRun
    # has a new start time" as finished.
    # NOTE(review): assumes lastRun.startedAt changes for every new run - confirm.
    if [[ -z "$started" || "$started" == "$prev_started" ]]; then
      continue
    fi
  fi

  st="$(echo "$lr" | jq -r '.status // "unknown"')"
  disc="$(echo "$lr" | jq -r '.jobsDiscovered // 0')"
  proc="$(echo "$lr" | jq -r '.jobsProcessed // 0')"
  err="$(echo "$lr" | jq -r '.errorMessage // empty')"
  completed="$(echo "$lr" | jq -r '.completedAt // ""')"

  msg="<b>Jobber</b> pipeline: <b>$(tg_html_escape "$st")</b>"
  msg+=$'\n'"Discovered: ${disc}, processed: ${proc}."
  if [[ -n "$err" ]]; then
    msg+=$'\n'"<b>Error:</b> $(tg_html_escape "$err")"
  fi

  jobs_resp="$(fetch_jobs_list_when_ready "$disc")" || true
  if echo "$jobs_resp" | jq -e '.ok == true' >/dev/null 2>&1; then
    list_n="$(echo "$jobs_resp" | jq -r '((.data // {}) | .jobs // []) | length')"
    # A failing filter must not lose the summary: fall back to an empty selection
    # flagged with jqError so the message can say so.
    if ! sel="$(build_job_lines_html "$jobs_resp" "$started" "$completed" "$MAX_JOBS")"; then
      sel='{"total":0,"lines":[],"usedFallback":false,"jqError":true}'
    fi
    total="$(echo "$sel" | jq -r '.total // 0')"
    shown="$(echo "$sel" | jq -r '.lines | length')"
    used_fb="$(echo "$sel" | jq -r '.usedFallback // false')"
    jq_err="$(echo "$sel" | jq -r '.jqError // false')"
    if [[ "$total" -gt 0 ]]; then
      if [[ "$used_fb" == "true" ]]; then
        msg+=$'\n\n'"<b>Recent jobs</b> (showing ${shown} of ${total}; time window did not match — links may include older discoveries):"
      else
        msg+=$'\n\n'"<b>Jobs in this run</b> (showing ${shown} of ${total}):"
      fi
      append_lines_from_json "$sel" msg
      rest=$((total - shown))
      if [[ "$rest" -gt 0 ]]; then
        msg+=$'\n\n'"<i>…and ${rest} more not shown.</i>"
      fi
    else
      # Nothing to list: add diagnostics based on the job count the server reports.
      rev_json="$(fetch_jobs_revision)" || true
      rev_ok="$(echo "$rev_json" | jq -r 'if .ok == true then "1" else "0" end' 2>/dev/null)" || rev_ok="0"
      rev_total="-1"
      if [[ "$rev_ok" == "1" ]]; then
        rev_total="$(echo "$rev_json" | jq -r '(.data.total // 0)')"
      fi
      msg+=$'\n\n'"<i>No job lines to show (list payload: ${list_n} rows)."
      if [[ "$jq_err" == "true" ]]; then
        msg+=" JSON/jq error while filtering.</i>"
      else
        msg+="</i>"
      fi
      if [[ "$rev_ok" == "1" ]]; then
        msg+=$'\n'"<i>GET /api/jobs/revision reports <b>${rev_total}</b> jobs in DB.</i>"
        if [[ "$rev_total" -gt 0 && "$list_n" -eq 0 ]]; then
          msg+=$'\n'"<i>List response empty but DB has jobs — check reverse-proxy body limits, or multiple instances with different data dirs.</i>"
        elif [[ "$rev_total" -eq 0 && "$disc" -gt 0 ]]; then
          msg+=$'\n'"<i>Pipeline run reported ${disc} discovered but DB job count is 0 — wrong <code>JOBOPS_URL</code> (different server), or DB reset since the run.</i>"
        fi
      else
        msg+=$'\n'"<i>Could not read /api/jobs/revision for diagnostics.</i>"
      fi
      msg+=$'\n'"<i>Open the app: $(tg_html_escape "${BASE}")</i>"
    fi
  else
    msg+=$'\n\n'"<i>Could not load GET /api/jobs for links.</i>"
  fi

  send_tg_html "$msg"
  exit 0
done

send_tg_html "Jobber: timed out waiting for pipeline (6h). Check server."
exit 1
|