- Introduced a new Indeed parsing strategy to support job extraction from Indeed, including advanced filtering options.
- Updated job search parser to include Indeed in the site strategies, allowing for combined searches with other job sites.
- Enhanced README documentation with detailed usage instructions for the Indeed parser, including examples for keyword and location filtering.
- Improved logging for Indeed parsing to provide insights into job extraction processes and potential CAPTCHA handling.
87 lines
2.2 KiB
JavaScript
87 lines
2.2 KiB
JavaScript
const playwright = require('playwright');
|
|
const AuthManager = require('./auth-manager');
|
|
const NavigationManager = require('./navigation');
|
|
|
|
/**
 * Owns a single Playwright Chromium browser and browser context, keeps a
 * registry of named pages, and delegates authentication and navigation to
 * an AuthManager and a NavigationManager that are constructed with this
 * instance.
 */
class CoreParser {
  /**
   * @param {object} [config] - Overrides merged on top of the defaults.
   * @param {boolean} [config.headless=true] - Passed to `chromium.launch`.
   * @param {number} [config.timeout=60000] - Default `timeout` forwarded by
   *   `navigateTo` (presumably milliseconds — confirm against NavigationManager).
   * @param {string} [config.userAgent] - Overrides the built-in user agent.
   */
  constructor(config = {}) {
    this.config = {
      headless: true,
      timeout: 60000, // Increased default timeout
      ...config,
    };
    this.browser = null;
    this.context = null;
    this.pages = {};
    this.authManager = new AuthManager(this);
    this.navigationManager = new NavigationManager(this);
  }

  /**
   * Launches Chromium and creates the shared browser context.
   *
   * @returns {Promise<void>}
   */
  async init() {
    this.browser = await playwright.chromium.launch({
      headless: this.config.headless,
    });

    // Create context with user agent to appear more like a real browser
    const contextOptions = {
      userAgent:
        this.config.userAgent ||
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
      viewport: { width: 1920, height: 1080 },
      locale: 'en-US',
      timezoneId: 'America/New_York',
      // Add extra HTTP headers to appear more legitimate
      extraHTTPHeaders: {
        'Accept-Language': 'en-US,en;q=0.9',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1',
      },
    };

    this.context = await this.browser.newContext(contextOptions);
  }

  /**
   * Opens a new page in the shared context and registers it under `id`,
   * initialising the browser first when none has been launched yet.
   * An existing entry with the same `id` is overwritten in the registry.
   *
   * @param {string} id - Key under which the page is stored.
   * @returns {Promise<object>} The newly created page.
   */
  async createPage(id) {
    if (!this.browser) await this.init();
    const page = await this.context.newPage();
    this.pages[id] = page;
    return page;
  }

  /**
   * Looks up a page previously registered by `createPage`.
   *
   * @param {string} id - Key the page was stored under.
   * @returns {object|undefined} The stored page, or `undefined` when absent.
   */
  getPage(id) {
    return this.pages[id];
  }

  /**
   * Delegates to `AuthManager#authenticate` with the arguments unchanged.
   *
   * @param {*} site
   * @param {*} credentials
   * @param {*} pageId
   * @returns {Promise<*>} Whatever the auth manager returns.
   */
  async authenticate(site, credentials, pageId) {
    return this.authManager.authenticate(site, credentials, pageId);
  }

  /**
   * Delegates to `NavigationManager#navigateTo`, filling in this class's
   * defaults for any option the caller left out. Keys not listed below are
   * forwarded untouched.
   *
   * @param {string} url - Target passed through to the navigation manager.
   * @param {object} [options]
   * @param {string} [options.pageId="default"]
   * @param {string} [options.waitUntil="networkidle"]
   * @param {number} [options.retries=1]
   * @param {number} [options.retryDelay=2000]
   * @param {number} [options.timeout=this.config.timeout]
   * @returns {Promise<*>} Whatever the navigation manager returns.
   */
  async navigateTo(url, options = {}) {
    const {
      pageId = 'default',
      waitUntil = 'networkidle', // Changed default to networkidle
      retries = 1,
      retryDelay = 2000,
      timeout = this.config.timeout,
      ...passthrough
    } = options || {};

    // Forward the resolved values: previously the raw `options` object was
    // passed on, so the defaults declared above never took effect.
    return this.navigationManager.navigateTo(url, {
      ...passthrough,
      pageId,
      waitUntil,
      retries,
      retryDelay,
      timeout,
    });
  }

  /**
   * Closes the browser (if one was launched) and resets the browser,
   * context and page registry. The state is reset even when `close()`
   * fails so the instance never keeps references to a dead browser; the
   * close error still propagates to the caller.
   *
   * @returns {Promise<void>}
   */
  async cleanup() {
    if (!this.browser) return;

    try {
      await this.browser.close();
    } finally {
      this.browser = null;
      this.context = null;
      this.pages = {};
    }
  }
}
|
|
|
|
// CommonJS export: the module's value is the CoreParser class itself.
module.exports = CoreParser;
|
|
|
|
|
|
|
|
|