tanyar09 47cdc03fb8 Enhance job search parser with advanced keyword filtering and job detail extraction
- Implemented grouped AND/OR logic for keyword searches, allowing for more flexible job matching criteria.
- Added a minimum date filter to restrict job results to postings after a specified date.
- Enhanced job detail extraction to include role duties and job requirements from job descriptions.
- Updated README with new command line options and examples for using date filters and keyword logic.
- Improved logging to provide clearer insights into keyword matching logic and job search parameters.
2025-12-18 13:33:19 -05:00

346 lines
11 KiB
JavaScript

/**
* SkipTheDrive Job Parser
*
* Parses remote job listings from SkipTheDrive.com
* Supports keyword search, job type filters, and pagination
*/
const { chromium } = require("playwright");
const path = require("path");
// Import from ai-analyzer core package
const {
logger,
cleanText,
containsAnyKeyword,
containsAllKeywords,
parseLocationFilters,
validateLocationAgainstFilters,
extractLocationFromProfile,
analyzeBatch,
checkOllamaStatus,
} = require("../../ai-analyzer");
/**
 * Build search URL for SkipTheDrive
 *
 * @param {string} keyword - Search keyword (URL-encoded into the `s` query param)
 * @param {string} [orderBy="date"] - Sort order (date, relevance); falsy values omit the param
 * @param {Array<string>} [jobTypes=[]] - Job types to filter (part time, full time, contract);
 *   each entry is appended as a repeated `jobtype` query param
 * @returns {string} - Formatted search URL
 */
function buildSearchUrl(keyword, orderBy = "date", jobTypes = []) {
  let url = `https://www.skipthedrive.com/?s=${encodeURIComponent(keyword)}`;
  if (orderBy) {
    // Encode like the other params: a no-op for the expected "date"/"relevance"
    // values, but keeps the URL well-formed for any other caller-supplied value.
    url += `&orderby=${encodeURIComponent(orderBy)}`;
  }
  // Add job type filters (one repeated `jobtype` param per type)
  jobTypes.forEach((type) => {
    url += `&jobtype=${encodeURIComponent(type)}`;
  });
  return url;
}
/**
 * Extract job data from a single job listing element.
 *
 * Reads the title, URL, posting date, company, relative age, excerpt and
 * sponsored flag from one search-result article node.
 *
 * @param {Element} article - Job listing DOM element (element handle)
 * @returns {Object|null} - Extracted job data, or null if extraction failed
 */
async function extractJobData(article) {
  // Tiny readers so each field below is a one-liner.
  const textOf = async (selector) => {
    const el = await article.$(selector);
    return el ? await el.textContent() : "";
  };
  const attrOf = async (selector, attr) => {
    const el = await article.$(selector);
    return el ? await el.getAttribute(attr) : "";
  };
  // Drops the leading icon glyph (first whitespace-delimited token) that the
  // site prepends to the company/date fields.
  const stripIcon = (text) => text.replace(/^\s*[^\s]+\s*/, "").trim();

  try {
    const title = await textOf("h2.post-title a");
    const jobUrl = await attrOf("h2.post-title a", "href");
    const datePosted = await attrOf("time.post-date", "datetime");
    const dateText = await textOf("time.post-date");
    const company = stripIcon(
      await textOf(".custom_fields_company_name_display_search_results")
    );
    const daysAgo = stripIcon(
      await textOf(".custom_fields_job_date_display_search_results")
    );
    const description = await textOf(".excerpt_part");
    // Sponsored/featured listings carry a dedicated marker element.
    const isFeatured = !!(await article.$(".custom_fields_sponsored_job"));
    // The numeric job ID lives in the article's DOM id ("post-12345").
    const articleId = await article.getAttribute("id");
    const jobId = articleId ? articleId.replace("post-", "") : "";

    return {
      jobId,
      title: cleanText(title),
      company: cleanText(company),
      jobUrl,
      datePosted,
      dateText: cleanText(dateText),
      daysAgo: cleanText(daysAgo),
      description: cleanText(description),
      isFeatured,
      source: "skipthedrive",
      timestamp: new Date().toISOString(),
    };
  } catch (error) {
    logger.error(`Error extracting job data: ${error.message}`);
    return null;
  }
}
/**
 * Parse SkipTheDrive job listings.
 *
 * Launches Chromium, runs one search per keyword (or a single combined
 * search when `useAndLogic` is set), paginates through results up to
 * `maxPages`, filters each listing by keyword and optional location, and
 * optionally runs an AI relevance pass (via Ollama) over accepted jobs.
 *
 * @param {Object} options - Parser options; each falls back to an env var
 * @param {Array<string>} [options.keywords] - Search terms (SEARCH_KEYWORDS, comma-separated)
 * @param {Array<string>} [options.jobTypes] - Job type filters (JOB_TYPES, comma-separated)
 * @param {string} [options.locationFilter] - Allowed locations (LOCATION_FILTER)
 * @param {number} [options.maxPages] - Page cap per keyword (MAX_PAGES, default 5)
 * @param {boolean} [options.headless] - Headless browser (any HEADLESS value except "false")
 * @param {boolean} [options.enableAI] - Run AI analysis (ENABLE_AI_ANALYSIS === "true")
 * @param {string} [options.aiContext] - Prompt context for the AI pass (AI_CONTEXT)
 * @param {boolean} [options.useAndLogic] - Require ALL keywords to match instead of ANY
 * @returns {Promise<Object>} - { results, rejectedResults, metadata }
 * @throws Re-throws any fatal error after logging; the browser is always closed
 */
async function parseSkipTheDrive(options = {}) {
  const {
    keywords = process.env.SEARCH_KEYWORDS?.split(",").map((k) => k.trim()) || [
      "software engineer",
      "developer",
    ],
    jobTypes = process.env.JOB_TYPES?.split(",").map((t) => t.trim()) || [],
    locationFilter = process.env.LOCATION_FILTER || "",
    maxPages = parseInt(process.env.MAX_PAGES) || 5,
    headless = process.env.HEADLESS !== "false",
    enableAI = process.env.ENABLE_AI_ANALYSIS === "true",
    aiContext = process.env.AI_CONTEXT || "remote job opportunities analysis",
    useAndLogic = false, // Use AND logic instead of OR logic for keywords
  } = options;
  logger.step("Starting SkipTheDrive parser...");
  logger.info(`🔍 Keywords: ${keywords.join(", ")}`);
  logger.info(`🔗 Keyword Logic: ${useAndLogic ? "AND (all keywords must match)" : "OR (any keyword matches)"}`);
  logger.info(
    `📋 Job Types: ${jobTypes.length > 0 ? jobTypes.join(", ") : "All"}`
  );
  logger.info(`📍 Location Filter: ${locationFilter || "None"}`);
  logger.info(`📄 Max Pages: ${maxPages}`);
  // Sandbox/shm flags let the browser run inside containers and CI runners.
  const browser = await chromium.launch({
    headless,
    args: [
      "--no-sandbox",
      "--disable-setuid-sandbox",
      "--disable-dev-shm-usage",
    ],
  });
  // Fixed desktop user agent so the site serves its standard markup.
  const context = await browser.newContext({
    userAgent:
      "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
  });
  const results = []; // jobs that passed every filter
  const rejectedResults = []; // jobs dropped by keyword/location checks (with a reason)
  const seenJobs = new Set(); // jobIds already handled; dedupes across keywords and pages
  try {
    // For AND logic, combine all keywords into a single search query
    // For OR logic, search each keyword separately
    const searchKeywords = useAndLogic ? [keywords.join(" ")] : keywords;
    // Search for each keyword (or combined keyword for AND logic)
    for (const keyword of searchKeywords) {
      logger.info(`\n🔍 Searching for: ${keyword}`);
      const searchUrl = buildSearchUrl(keyword, "date", jobTypes);
      // Fresh page per keyword; closed in the finally below.
      const page = await context.newPage();
      try {
        logger.info(
          `Attempting navigation to: ${searchUrl} at ${new Date().toISOString()}`
        );
        await page.goto(searchUrl, {
          waitUntil: "domcontentloaded",
          timeout: 30000,
        });
        logger.info(
          `Navigation completed successfully at ${new Date().toISOString()}`
        );
        // Wait for job listings to load. A timeout here just means zero hits
        // for this keyword: log a warning and fall through — the page loop
        // below will simply find no articles.
        logger.info("Waiting for selector #loops-wrapper");
        await page
          .waitForSelector("#loops-wrapper", { timeout: 5000 })
          .catch(() => {
            logger.warning(`No results found for keyword: ${keyword}`);
          });
        logger.info("Selector wait completed");
        let currentPage = 1;
        let hasNextPage = true;
        // Paginate until there is no "next" link or maxPages is reached.
        while (hasNextPage && currentPage <= maxPages) {
          logger.info(`📄 Processing page ${currentPage} for "${keyword}"`);
          // Extract all job articles on current page
          const jobArticles = await page.$$("article[id^='post-']");
          logger.info(
            `Found ${jobArticles.length} job listings on page ${currentPage}`
          );
          for (const article of jobArticles) {
            const jobData = await extractJobData(article);
            // Skip extraction failures and duplicates.
            if (!jobData || seenJobs.has(jobData.jobId)) {
              continue;
            }
            seenJobs.add(jobData.jobId);
            // Add keyword that found this job
            jobData.searchKeyword = keyword;
            // Validate job against keywords: the site's search can return
            // loosely-related listings, so re-check the original keyword list
            // against title + description + company.
            const fullText = `${jobData.title} ${jobData.description} ${jobData.company}`;
            const keywordMatch = useAndLogic
              ? containsAllKeywords(fullText, keywords)
              : containsAnyKeyword(fullText, keywords);
            if (!keywordMatch) {
              rejectedResults.push({
                ...jobData,
                rejected: true,
                reason: useAndLogic
                  ? "Not all keywords found in job listing"
                  : "Keywords not found in job listing",
              });
              continue;
            }
            // Location validation (if enabled)
            if (locationFilter) {
              const locationFilters = parseLocationFilters(locationFilter);
              // For SkipTheDrive, most jobs are remote, but we can check the title/description.
              // Accept if the text mentions "remote" OR matches any configured filter.
              const locationValid =
                fullText.toLowerCase().includes("remote") ||
                locationFilters.some((filter) =>
                  fullText.toLowerCase().includes(filter.toLowerCase())
                );
              if (!locationValid) {
                rejectedResults.push({
                  ...jobData,
                  rejected: true,
                  reason: "Location requirements not met",
                });
                continue;
              }
              jobData.locationValid = locationValid;
            }
            logger.success(`✅ Found: ${jobData.title} at ${jobData.company}`);
            results.push(jobData);
          }
          // Check for next page
          const nextPageLink = await page.$("a.nextp");
          if (nextPageLink && currentPage < maxPages) {
            logger.info("📄 Moving to next page...");
            // NOTE(review): relies on domcontentloaded plus a fixed 2s grace
            // period for the next page's listings to render — confirm this is
            // sufficient against the live site.
            await nextPageLink.click();
            await page.waitForLoadState("domcontentloaded");
            await page.waitForTimeout(2000); // Wait for content to load
            currentPage++;
          } else {
            hasNextPage = false;
          }
        }
      } catch (error) {
        // One keyword failing must not abort the others.
        logger.error(`Error processing keyword "${keyword}": ${error.message}`);
      } finally {
        await page.close();
      }
    }
    logger.success(`\n✅ Parsing complete!`);
    logger.info(`📊 Total jobs found: ${results.length}`);
    logger.info(`❌ Rejected jobs: ${rejectedResults.length}`);
    // Run AI analysis if enabled (best-effort: skipped when Ollama is down)
    let aiAnalysis = null;
    if (enableAI && results.length > 0) {
      logger.step("Running AI analysis on job listings...");
      const aiAvailable = await checkOllamaStatus();
      if (aiAvailable) {
        const analysisData = results.map((job) => ({
          text: `${job.title} at ${job.company}. ${job.description}`,
          metadata: {
            jobId: job.jobId,
            company: job.company,
            daysAgo: job.daysAgo,
          },
        }));
        aiAnalysis = await analyzeBatch(analysisData, aiContext);
        // Merge AI analysis with results. NOTE(review): assumes analyzeBatch
        // returns results in the same order as its input — verify in
        // ai-analyzer.
        results.forEach((job, index) => {
          if (aiAnalysis && aiAnalysis[index]) {
            job.aiAnalysis = {
              isRelevant: aiAnalysis[index].isRelevant,
              confidence: aiAnalysis[index].confidence,
              reasoning: aiAnalysis[index].reasoning,
            };
          }
        });
        logger.success("✅ AI analysis completed");
      } else {
        logger.warning("⚠️ AI not available - skipping analysis");
      }
    }
    return {
      results,
      rejectedResults,
      metadata: {
        source: "skipthedrive",
        totalJobs: results.length,
        rejectedJobs: rejectedResults.length,
        keywords: keywords,
        jobTypes: jobTypes,
        locationFilter: locationFilter,
        aiAnalysisEnabled: enableAI,
        aiAnalysisCompleted: !!aiAnalysis,
        timestamp: new Date().toISOString(),
      },
    };
  } catch (error) {
    logger.error(`Fatal error in SkipTheDrive parser: ${error.message}`);
    throw error;
  } finally {
    // Browser is closed on success and failure alike.
    await browser.close();
  }
}
// Public API: the main parser entry point, plus the URL builder and
// per-article extractor exported separately so they can be tested in isolation.
module.exports = {
  parseSkipTheDrive,
  buildSearchUrl,
  extractJobData,
};