tanyar09 673f84d388 Add Indeed parsing strategy and enhance job search parser
- Introduced a new Indeed parsing strategy to support job extraction from Indeed, including advanced filtering options.
- Updated job search parser to include Indeed in the site strategies, allowing for combined searches with other job sites.
- Enhanced README documentation with detailed usage instructions for the Indeed parser, including examples for keyword and location filtering.
- Improved logging for Indeed parsing to provide insights into job extraction processes and potential CAPTCHA handling.
2025-12-18 14:01:06 -05:00

87 lines
2.2 KiB
JavaScript

const playwright = require('playwright');
const AuthManager = require('./auth-manager');
const NavigationManager = require('./navigation');
/**
 * Owns a single Playwright Chromium browser and browser context, keeps a
 * registry of named pages, and delegates authentication and navigation to
 * the AuthManager / NavigationManager helpers constructed with this parser.
 */
class CoreParser {
  /**
   * @param {object} [config] - Overrides merged on top of the defaults.
   * @param {boolean} [config.headless=true] - Passed to the browser launch.
   * @param {number} [config.timeout=60000] - Default navigation timeout.
   * @param {string} [config.userAgent] - Replaces the built-in user agent.
   */
  constructor(config = {}) {
    this.config = {
      headless: true,
      timeout: 60000, // Increased default timeout
      ...config,
    };
    this.browser = null;
    this.context = null;
    this.pages = {};
    this.authManager = new AuthManager(this);
    this.navigationManager = new NavigationManager(this);
  }

  /**
   * Launches Chromium (if not already running) and creates the shared
   * browser context. Calling it again once initialised is a no-op, so a
   * running browser is never orphaned.
   *
   * @returns {Promise<void>}
   * @throws Rethrows any launch or context-creation failure; on a
   *   context-creation failure the browser is closed first.
   */
  async init() {
    if (!this.browser) {
      this.browser = await playwright.chromium.launch({
        headless: this.config.headless,
      });
    }
    if (this.context) {
      return;
    }

    // Create context with user agent to appear more like a real browser
    const contextOptions = {
      userAgent: this.config.userAgent || 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      viewport: { width: 1920, height: 1080 },
      locale: 'en-US',
      timezoneId: 'America/New_York',
      // Add extra HTTP headers to appear more legitimate
      extraHTTPHeaders: {
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
      },
    };

    try {
      this.context = await this.browser.newContext(contextOptions);
    } catch (err) {
      // Do not leave a launched browser process behind when the context
      // cannot be created.
      const launched = this.browser;
      this.browser = null;
      try {
        await launched.close();
      } catch {
        // Best effort: the context-creation error below is the one the
        // caller needs to see.
      }
      throw err;
    }
  }

  /**
   * Opens a new page in the shared context and registers it under `id`,
   * initialising the browser/context first when needed.
   *
   * @param {string} id - Key under which the page is stored.
   * @returns {Promise<object>} The newly created page.
   */
  async createPage(id) {
    // Guard on the context (the object actually used below), not only the
    // browser, so a half-initialised parser is completed before use.
    if (!this.browser || !this.context) {
      await this.init();
    }
    const page = await this.context.newPage();
    this.pages[id] = page;
    return page;
  }

  /**
   * @param {string} id - Key used when the page was created.
   * @returns {object|undefined} The registered page, or undefined if none.
   */
  getPage(id) {
    return this.pages[id];
  }

  /**
   * Delegates to the auth manager.
   *
   * @returns {Promise<*>} Whatever the auth manager's authenticate returns.
   */
  async authenticate(site, credentials, pageId) {
    return this.authManager.authenticate(site, credentials, pageId);
  }

  /**
   * Delegates navigation to the navigation manager, filling in defaults for
   * any option the caller left out. Extra keys in `options` are forwarded
   * untouched.
   *
   * @param {string} url - Destination URL.
   * @param {object} [options]
   * @param {string} [options.pageId="default"]
   * @param {string} [options.waitUntil="networkidle"]
   * @param {number} [options.retries=1]
   * @param {number} [options.retryDelay=2000]
   * @param {number} [options.timeout] - Defaults to `config.timeout`.
   * @returns {Promise<*>} Whatever the navigation manager's navigateTo returns.
   */
  async navigateTo(url, options = {}) {
    const {
      pageId = 'default',
      waitUntil = 'networkidle', // Changed default to networkidle
      retries = 1,
      retryDelay = 2000,
      timeout = this.config.timeout,
    } = options;

    // Forward the resolved values; passing the raw `options` would discard
    // every default computed above.
    return this.navigationManager.navigateTo(url, {
      ...options,
      pageId,
      waitUntil,
      retries,
      retryDelay,
      timeout,
    });
  }

  /**
   * Closes the browser (if any) and resets all cached handles. State is
   * reset even when closing fails, so stale handles are never reused.
   *
   * @returns {Promise<void>}
   */
  async cleanup() {
    if (!this.browser) {
      return;
    }
    try {
      await this.browser.close();
    } finally {
      this.browser = null;
      this.context = null;
      this.pages = {};
    }
  }
}
// Expose CoreParser as this module's CommonJS export.
module.exports = CoreParser;