linkedout/location-utils.js
2025-07-03 21:41:56 -04:00

1127 lines
23 KiB
JavaScript

/**
* Enhanced Location Filtering Utilities - Improved Version
*
* Place all keyword CSVs in the keywords/ folder for use with LinkedOut.
*
* These utilities provide:
* - Comprehensive city/province lookup for Canada
* - Fast O(1) city-to-province matching
* - Flexible location filter parsing and validation
* - Used by linkedout.js for profile location validation
*
* USAGE (for developers):
* const { parseLocationFilters, validateLocationAgainstFilters, extractLocationFromProfile } = require('./location-utils');
*
* See linkedout.js for integration details.
*/
// Silence D-Bus / accessibility-bridge notification errors when running under WSL.
// Applied as a module-load side effect so it is in place before anything else runs.
Object.assign(process.env, {
  NO_AT_BRIDGE: "1",
  DBUS_SESSION_BUS_ADDRESS: "/dev/null",
});
/**
 * Canadian city/community names grouped by province or territory.
 *
 * Keys are canonical lowercase province/territory names; values are lowercase,
 * unaccented place names. A name may legitimately appear under more than one
 * province (e.g. "windsor" in Ontario and Nova Scotia, "lloydminster" on the
 * Alberta/Saskatchewan border) but must appear at most once per province so the
 * per-province counts stay accurate.
 */
const CITIES_BY_PROVINCE = {
  ontario: [
    // Greater Toronto Area
    "toronto",
    "mississauga",
    "brampton",
    "markham",
    "vaughan",
    "richmond hill",
    "oakville",
    "burlington",
    "pickering",
    "ajax",
    "whitby",
    "oshawa",
    "milton",
    "newmarket",
    "aurora",
    "georgina",
    "king",
    "whitchurch-stouffville",
    "caledon",
    "halton hills",
    "clarington",
    "scugog",
    "uxbridge",
    // Southwestern Ontario
    "london",
    "windsor",
    "kitchener",
    "waterloo",
    "cambridge",
    "guelph",
    "brantford",
    "woodstock",
    "stratford",
    "sarnia",
    "chatham",
    "leamington",
    "kingsville",
    "amherstburg",
    "tecumseh",
    "lakeshore",
    "essex",
    "tilbury",
    "st. thomas",
    "ingersoll",
    "tillsonburg",
    "simcoe",
    "delhi",
    "port dover",
    "welland",
    "niagara falls",
    "st. catharines",
    "thorold",
    "fort erie",
    "grimsby",
    "lincoln",
    "pelham",
    "wainfleet",
    "west lincoln",
    // Central Ontario
    "hamilton",
    "barrie",
    "orillia",
    "midland",
    "penetanguishene",
    "collingwood",
    "wasaga beach",
    "blue mountains",
    "clearview",
    "springwater",
    "innisfil",
    "bradford west gwillimbury",
    "essa",
    "new tecumseth",
    "adjala-tosorontio",
    "mono",
    "orangeville",
    "shelburne",
    "mulmur",
    "amaranth",
    "east garafraxa",
    // Eastern Ontario (Gatineau is across the river in Quebec and is listed there)
    "ottawa",
    "kingston",
    "cornwall",
    "pembroke",
    "petawawa",
    "deep river",
    "arnprior",
    "renfrew",
    "carleton place",
    "almonte",
    "smiths falls",
    "perth",
    "brockville",
    "prescott",
    "iroquois",
    "morrisburg",
    "winchester",
    "kemptville",
    "merrickville-wolford",
    "westport",
    "gananoque",
    "lansdowne",
    "belleville",
    "trenton",
    "picton",
    "napanee",
    "deseronto",
    "quinte west",
    // Northern Ontario
    "sudbury",
    "north bay",
    "sault ste. marie",
    "thunder bay",
    "timmins",
    "kirkland lake",
    "cochrane",
    "kapuskasing",
    "hearst",
    "iroquois falls",
    "smooth rock falls",
    "matheson",
    "new liskeard",
    "haileybury",
    "cobalt",
    "temiskaming shores",
    "englehart",
    "elliot lake",
    "espanola",
    "blind river",
    "spanish",
    "massey",
    "thessalon",
    "wawa",
    "chapleau",
    "white river",
    "marathon",
    "terrace bay",
    "schreiber",
    "nipigon",
    "red rock",
    "geraldton",
    "longlac",
    "beardmore",
    "greenstone",
    "ignace",
    "dryden",
    "kenora",
    "fort frances",
    "atikokan",
    "rainy river",
    "emo",
    "sioux lookout",
    "pickle lake",
    "red lake",
    // Additional mid-size communities
    "cobourg",
    "port hope",
    "peterborough",
    "lindsay",
    "fenelon falls",
    "bobcaygeon",
    "minden",
    "haliburton",
    "bancroft",
    "barry's bay",
    "huntsville",
    "bracebridge",
    "gravenhurst",
    "parry sound",
    "burk's falls",
    "powassan",
    "callander",
    "sturgeon falls",
    "west nipissing",
    "french river",
    "killarney",
    "gore bay",
    "little current",
    "mindemoya",
    "wikwemikong",
    "m'chigeeng",
    "aundeck omni kaning",
  ],
  manitoba: [
    "winnipeg",
    "brandon",
    "steinbach",
    "thompson",
    "portage la prairie",
    "winkler",
    "selkirk",
    "morden",
    "dauphin",
    "the pas",
    "flin flon",
    "swan river",
    "neepawa",
    "virden",
    "souris",
    "carman",
    "stonewall",
    "beausejour",
    "gimli",
    "arborg",
    "teulon",
    "ashern",
    "eriksdale",
    "fisher branch",
    "riverton",
    "winnipeg beach",
    "dunnottar",
    "altona",
    "morris",
    "emerson",
    "killarney",
    "boissevain",
    "deloraine",
    "melita",
    "waskada",
    "cartwright",
    "crystal city",
    "pilot mound",
    "manitou",
    "la riviere",
    "glenboro",
    "treherne",
    "holland",
    "hamiota",
    "shoal lake",
    "russell",
    "roblin",
    "grandview",
    "minitonas",
    "bowsman",
    "birtle",
    "rossburn",
    "sandy lake",
  ],
  "british columbia": [
    "vancouver",
    "surrey",
    "burnaby",
    "richmond",
    "abbotsford",
    "coquitlam",
    "langley",
    "delta",
    "north vancouver",
    "west vancouver",
    "new westminster",
    "port coquitlam",
    "maple ridge",
    "white rock",
    "pitt meadows",
    "port moody",
    "bowen island",
    "anmore",
    "belcarra",
    "lions bay",
    "victoria",
    "saanich",
    "esquimalt",
    "oak bay",
    "view royal",
    "sidney",
    "central saanich",
    "north saanich",
    "highlands",
    "metchosin",
    "sooke",
    "colwood",
    "langford",
    "duncan",
    "nanaimo",
    "parksville",
    "qualicum beach",
    "courtenay",
    "comox",
    "campbell river",
    "port alberni",
    "tofino",
    "ucluelet",
    "kelowna",
    "vernon",
    "penticton",
    "kamloops",
    "salmon arm",
    "revelstoke",
    "golden",
    "invermere",
    "cranbrook",
    "fernie",
    "kimberley",
    "nelson",
    "castlegar",
    "trail",
    "rossland",
    "grand forks",
    "osoyoos",
    "oliver",
    "summerland",
    "peachland",
    "westbank",
    "prince george",
    "quesnel",
    "williams lake",
    "100 mile house",
    "clinton",
    "cache creek",
    "ashcroft",
    "merritt",
    "princeton",
    "hope",
    "chilliwack",
    "mission",
    "harrison hot springs",
    "agassiz",
    "kent",
    "fraser valley",
    "squamish",
    "whistler",
    "pemberton",
    "lillooet",
    "lytton",
    "prince rupert",
    "terrace",
    "kitimat",
    "smithers",
    "burns lake",
    "vanderhoof",
    "fort st. john",
    "dawson creek",
    "tumbler ridge",
    "chetwynd",
    "hudson's hope",
    "fort nelson",
    "fort st. james",
  ],
  alberta: [
    "calgary",
    "edmonton",
    "red deer",
    "lethbridge",
    "medicine hat",
    "grande prairie",
    "airdrie",
    "spruce grove",
    "leduc",
    "lloydminster",
    "camrose",
    "wetaskiwin",
    "lacombe",
    "ponoka",
    "sylvan lake",
    "blackfalds",
    "innisfail",
    "olds",
    "didsbury",
    "carstairs",
    "cochrane",
    "canmore",
    "banff",
    "okotoks",
    "high river",
    "strathmore",
    "chestermere",
    "drumheller",
    "three hills",
    "hanna",
    "oyen",
    "consort",
    "provost",
    "wainwright",
    "vermilion",
    "bonnyville",
    "cold lake",
    "st. paul",
    "two hills",
    "vegreville",
    "mundare",
    "lamont",
    "bruderheim",
    "morinville",
    "legal",
    "bon accord",
    "gibbons",
    "redwater",
    "smoky lake",
    "willingdon",
    "andrew",
    "chipman",
    "fort saskatchewan",
    "sherwood park",
    "beaumont",
    "devon",
    "calmar",
    "thorsby",
    "warburg",
    "breton",
    "winfield",
    "drayton valley",
    "rocky mountain house",
    "sundre",
    "caroline",
    "rimbey",
    "bentley",
    "penhold",
    "bowden",
    "eckville",
    "fort mcmurray",
    "slave lake",
    "high prairie",
    "valleyview",
    "fox creek",
    "whitecourt",
    "mayerthorpe",
    "barrhead",
    "westlock",
    "athabasca",
    "boyle",
    "newbrook",
    "wandering river",
    "peace river",
    "grimshaw",
    "manning",
    "fairview",
    "high level",
    "rainbow lake",
    "zama city",
  ],
  quebec: [
    "montreal",
    "quebec city",
    "laval",
    "gatineau",
    "longueuil",
    "sherbrooke",
    "saguenay",
    "levis",
    "trois-rivieres",
    "terrebonne",
    "saint-jean-sur-richelieu",
    "repentigny",
    "brossard",
    "drummondville",
    "saint-jerome",
    "granby",
    "blainville",
    "saint-hyacinthe",
    "shawinigan",
    "dollard-des-ormeaux",
    "rimouski",
    "sorel-tracy",
    "victoriaville",
    "saint-eustache",
    "vaudreuil-dorion",
    "val-d'or",
    "salaberry-de-valleyfield",
    "sept-iles",
    "rouyn-noranda",
    "thetford mines",
    "alma",
    "joliette",
    "saint-georges",
    "baie-comeau",
    "mascouche",
    "beloeil",
    "chateauguay",
    "saint-constant",
    "sainte-catherine",
    "saint-bruno-de-montarville",
    "boucherville",
    "saint-lambert",
    "candiac",
    "la prairie",
    "saint-basile-le-grand",
    "carignan",
    "chambly",
    "saint-mathieu-de-beloeil",
  ],
  saskatchewan: [
    "saskatoon",
    "regina",
    "prince albert",
    "moose jaw",
    "swift current",
    "yorkton",
    "north battleford",
    "estevan",
    "weyburn",
    "lloydminster",
    "martensville",
    "warman",
    "humboldt",
    "kindersley",
    "melville",
    "tisdale",
    "nipawin",
    "melfort",
    "unity",
    "biggar",
    "rosetown",
    "outlook",
    "davidson",
    "watrous",
    "lanigan",
    "wynyard",
    "foam lake",
    "canora",
    "preeceville",
    "kamsack",
    "hudson bay",
    "carrot river",
    "white fox",
    "spiritwood",
    "maidstone",
    "lashburn",
    "cut knife",
    "wilkie",
    "macklin",
    "luseland",
    "kerrobert",
    "eston",
    "elrose",
    "alsask",
    "leader",
    "maple creek",
    "shaunavon",
    "gull lake",
    "cabri",
    "kyle",
  ],
  "nova scotia": [
    "halifax",
    "dartmouth",
    "sydney",
    "truro",
    "new glasgow",
    "glace bay",
    "yarmouth",
    "bridgewater",
    "kentville",
    "amherst",
    "new waterford",
    "sydney mines",
    "antigonish",
    "stellarton",
    "westville",
    "pictou",
    "digby",
    "windsor",
    "wolfville",
    "middleton",
    "annapolis royal",
    "liverpool",
    "shelburne",
    "lockeport",
    "lunenburg",
    "mahone bay",
    "chester",
    "hubbards",
    "tantallon",
    "fall river",
    "beaver bank",
    "sackville",
    "bedford",
    "cole harbour",
    "eastern passage",
    "porters lake",
    "musquodoboit harbour",
    "sheet harbour",
    "stewiacke",
    "shubenacadie",
    "elmsdale",
    "enfield",
    "lantz",
    "milford",
    "gay's river",
    "mount uniacke",
    "nine mile river",
  ],
  "new brunswick": [
    "saint john",
    "moncton",
    "fredericton",
    "dieppe",
    "riverview",
    "miramichi",
    "edmundston",
    "campbellton",
    "bathurst",
    "sackville",
    "sussex",
    "hampton",
    "quispamsis",
    "rothesay",
    "grand bay-westfield",
    "st. stephen",
    "st. andrews",
    "blacks harbour",
    "grand manan",
    "deer island",
    "campobello island",
    "woodstock",
    "hartland",
    "florenceville-bristol",
    "perth-andover",
    "grand falls",
    "plaster rock",
    "tobique first nation",
    "nackawic",
    "mcadam",
    "harvey",
    "chipman",
    "minto",
    "gagetown",
    "oromocto",
    "new maryland",
    "hanwell",
    "kingsclear",
    "stanley",
    "doaktown",
    "blackville",
    "renous",
    "boiestown",
    "caraquet",
    "shippagan",
    "tracadie",
    "neguac",
    "rogersville",
    "rexton",
    "richibucto",
    "bouctouche",
    "shediac",
    "cap-pele",
    "beaubassin-est",
  ],
  "newfoundland and labrador": [
    // Both the correct spelling and the apostrophe-less form people often type
    "st. john's",
    "st. johns",
    "mount pearl",
    "corner brook",
    "conception bay south",
    "paradise",
    "grand falls-windsor",
    "happy valley-goose bay",
    "gander",
    "carbonear",
    "stephenville",
    "bay roberts",
    "clarenville",
    "marystown",
    "deer lake",
    "channel-port aux basques",
    "labrador city",
    "wabana",
    "holyrood",
    "portugal cove-st. philips",
    "torbay",
    "pouch cove",
    "flatrock",
    "logy bay-middle cove-outer cove",
    "petty harbour-maddox cove",
    "bauline",
    "witless bay",
    "ferryland",
    "aquaforte",
    "renews-cappahayden",
    "trepassey",
    "branch",
    "placentia",
    "come by chance",
    "sunnyside",
    "whitbourne",
    "chapel arm",
    "bluewater",
    "norman's cove-long cove",
    "heart's content",
    "heart's delight-islington",
    "cavendish",
    "new melbourne",
    "whiteway",
    "trinity",
    "bonavista",
  ],
  "prince edward island": [
    "charlottetown",
    "summerside",
    "stratford",
    "cornwall",
    "montague",
    "souris",
    "kensington",
    "alberton",
    "tignish",
    "o'leary",
    "wellington",
    "borden-carleton",
    "murray river",
    "georgetown",
    "crapaud",
    "breadalbane",
    "hunter river",
    "new london",
    "cavendish",
    "stanley bridge",
    "rustico",
    "brackley",
    "winsloe",
    "york",
    "tea hill",
    "miltonvale park",
    "sherwood",
    "warren grove",
    "clyde river",
    "bonshaw",
    "vernon bridge",
    "orwell",
    "wood islands",
    "belle river",
    "murray harbour",
    "little sands",
    "gladstone",
    "annandale",
    "brudenell",
    "cardigan",
    "launching",
    "pooles corner",
    "morell",
    "st. peters",
    "red point",
    "lakeville",
    "souris west",
  ],
  // Nunavut communities (Iqaluit, Rankin Inlet, ...) live under `nunavut` only.
  "northwest territories": [
    "yellowknife",
    "hay river",
    "inuvik",
    "fort simpson",
    "fort smith",
    "norman wells",
    "fort mcpherson",
    "aklavik",
    "tuktoyaktuk",
    "paulatuk",
    "sachs harbour",
    "ulukhaktok",
    "tsiigehtchic",
    "fort good hope",
    "colville lake",
    "tulita",
    "deline",
    "wrigley",
    "nahanni butte",
    "jean marie river",
    "kakisa",
    "enterprise",
    "fort resolution",
    "lutselk'e",
    "gameti",
    "wekweeti",
    "whati",
    "behchoko",
  ],
  yukon: [
    "whitehorse",
    "dawson city",
    "watson lake",
    "haines junction",
    "carmacks",
    "mayo",
    "faro",
    "ross river",
    "teslin",
    "carcross",
    "tagish",
    "marsh lake",
    "ibex valley",
    "mount lorne",
    "granger",
    "takhini",
    "fish lake",
    "mendenhall",
    "pelly crossing",
    "stewart crossing",
    "beaver creek",
    "destruction bay",
    "burwash landing",
    "kluane lake",
    "silver city",
    "champagne",
    "old crow",
    "eagle plains",
  ],
  nunavut: [
    "iqaluit",
    "rankin inlet",
    "arviat",
    "baker lake",
    "cambridge bay",
    "gjoa haven",
    "kugluktuk",
    "taloyoak",
    "kugaaruk",
    "igloolik",
    "hall beach",
    "pond inlet",
    "arctic bay",
    "clyde river",
    "pangnirtung",
    "cape dorset",
    "kimmirut",
    "sanikiluaq",
    "whale cove",
    "chesterfield inlet",
    "coral harbour",
    "naujaat",
    "sanirajak",
    "grise fiord",
    "resolute",
    "alert",
    "eureka",
  ],
};
// Reverse index: lowercase city name -> lowercase province name, for direct lookups.
// A name listed under several provinces (e.g. "windsor") keeps only the province that
// appears last in CITIES_BY_PROVINCE, because later assignments overwrite earlier ones.
const CITY_TO_PROVINCE = {};
Object.keys(CITIES_BY_PROVINCE).forEach((provinceName) => {
  const provinceKey = provinceName.toLowerCase();
  CITIES_BY_PROVINCE[provinceName].forEach((cityName) => {
    CITY_TO_PROVINCE[cityName.toLowerCase()] = provinceKey;
  });
});
// Accepted spellings and abbreviations for each province/territory.
// Every tuple starts with the canonical lowercase name, which doubles as the lookup
// key; the stored list contains the canonical name followed by its alternatives.
const PROVINCE_VARIATIONS = Object.fromEntries(
  [
    ["ontario", "ont", "on"],
    ["manitoba", "man", "mb"],
    ["british columbia", "bc", "b.c."],
    ["alberta", "alta", "ab"],
    ["quebec", "que", "qc", "québec"],
    ["saskatchewan", "sask", "sk"],
    ["nova scotia", "ns", "n.s."],
    ["new brunswick", "nb", "n.b."],
    ["newfoundland and labrador", "nl", "n.l.", "newfoundland", "nfld"],
    ["prince edward island", "pei", "p.e.i."],
    ["northwest territories", "nt", "n.w.t.", "nwt"],
    ["yukon", "yt", "y.t."],
    ["nunavut", "nu", "nvt"],
  ].map((spellings) => [spellings[0], spellings])
);
/**
 * Turn a raw location-filter setting into a list of normalized filter terms.
 *
 * The string is split on commas and pipes; each piece is trimmed and lowercased,
 * and empty pieces are dropped. Examples:
 *   "Ontario"                  -> ["ontario"]
 *   "Ontario,Manitoba"         -> ["ontario", "manitoba"]
 *   "Ontario|Manitoba"         -> ["ontario", "manitoba"]
 *   "Toronto,Ontario,Vancouver" -> ["toronto", "ontario", "vancouver"]
 *
 * @param {string} locationFilterString - Raw filter text (e.g. from an environment variable)
 * @returns {string[]} Normalized filters; empty array when the input is empty/missing
 */
function parseLocationFilters(locationFilterString) {
  if (!locationFilterString) {
    return [];
  }
  const parsedFilters = [];
  for (const piece of locationFilterString.split(/[,|]/)) {
    const cleaned = piece.trim().toLowerCase();
    if (cleaned.length > 0) {
      parsedFilters.push(cleaned);
    }
  }
  return parsedFilters;
}
/**
 * Normalize text for comparison: lowercase, straighten curly apostrophes and strip
 * diacritics so "Trois-Rivières" compares equal to the unaccented "trois-rivieres".
 */
function foldLocationText(text) {
  return String(text)
    .toLowerCase()
    .replace(/[\u2018\u2019]/g, "'")
    .normalize("NFD")
    .replace(/[\u0300-\u036f]/g, "");
}

/**
 * Report whether `term` occurs in `haystack` as a whole term (not inside a longer word).
 * Both arguments are expected to be already folded with foldLocationText.
 */
function containsWholeTerm(haystack, term) {
  if (!term) return false;
  const escapedTerm = term.replace(/[-/\\^$*+?.()|[\]{}]/g, "\\$&");
  // Explicit "no word character on either side" checks are used instead of \b:
  // \b never matches after a trailing "." so terms such as "b.c." or "p.e.i."
  // could not match at the end of a string or before a comma/space.
  const pattern = new RegExp(`(?:^|[^a-z0-9_])${escapedTerm}(?![a-z0-9_])`);
  return pattern.test(haystack);
}

/** Return table[key] when it is an own array property, otherwise an empty list. */
function ownListOrEmpty(table, key) {
  const isOwn = Object.prototype.hasOwnProperty.call(table, key);
  return isOwn && Array.isArray(table[key]) ? table[key] : [];
}

/**
 * Enhanced location validation with comprehensive city coverage.
 *
 * For each filter, in order, the location is accepted when:
 *   1. the filter text itself appears in the location as a whole term;
 *   2. the filter is a province name and any spelling/abbreviation of that province
 *      appears in the location;
 *   3. the filter is a province name and a city listed under that province appears
 *      in the location (checked against CITIES_BY_PROVINCE directly, so a city name
 *      shared by several provinces matches each of them);
 *   4. the filter is a city name and the location mentions a province that lists it.
 * Comparison is case-insensitive and accent-insensitive.
 *
 * A missing location, or a missing/empty filter list, means "no filtering" and is
 * reported as valid.
 *
 * @param {string} userLocation - User's location from LinkedIn profile
 * @param {string[]} locationFilters - Array of location filters
 * @returns {Object} - {isValid: boolean, matchedFilter: string, reasoning: string}
 */
function validateLocationAgainstFilters(userLocation, locationFilters) {
  // Ignore unusable entries (non-strings, blanks) instead of throwing on them.
  const usableFilters = Array.isArray(locationFilters)
    ? locationFilters.filter(
        (candidate) => typeof candidate === "string" && candidate.trim().length > 0
      )
    : [];

  if (!userLocation || usableFilters.length === 0) {
    return {
      isValid: true,
      matchedFilter: null,
      reasoning: "No filtering applied",
    };
  }

  const normalizedLocation = foldLocationText(userLocation);
  const locationMentions = (term) =>
    containsWholeTerm(normalizedLocation, foldLocationText(term));

  for (const filter of usableFilters) {
    const normalizedFilter = foldLocationText(filter).trim();

    // 1. Direct whole-term match of the filter text
    if (containsWholeTerm(normalizedLocation, normalizedFilter)) {
      return {
        isValid: true,
        matchedFilter: filter,
        reasoning: `Direct match: "${normalizedFilter}" found in "${userLocation}"`,
      };
    }

    // 2. Filter is a province: look for any of its spellings/abbreviations
    const matchedVariation = ownListOrEmpty(
      PROVINCE_VARIATIONS,
      normalizedFilter
    ).find(locationMentions);
    if (matchedVariation !== undefined) {
      return {
        isValid: true,
        matchedFilter: filter,
        reasoning: `Province match: "${matchedVariation}" found in "${userLocation}"`,
      };
    }

    // 3. Filter is a province: look for any city listed under that province.
    //    Only that province's cities are scanned, rather than every city in the table.
    const matchedCity = ownListOrEmpty(
      CITIES_BY_PROVINCE,
      normalizedFilter
    ).find(locationMentions);
    if (matchedCity !== undefined) {
      return {
        isValid: true,
        matchedFilter: filter,
        reasoning: `City-to-province match: "${matchedCity}" maps to "${normalizedFilter}"`,
      };
    }

    // 4. Filter is a city: accept when the location mentions a province listing it
    for (const [province, cities] of Object.entries(CITIES_BY_PROVINCE)) {
      const listsFilterCity = cities.some(
        (city) => foldLocationText(city) === normalizedFilter
      );
      if (!listsFilterCity) continue;

      const provinceMentioned = ownListOrEmpty(
        PROVINCE_VARIATIONS,
        province.toLowerCase()
      ).some(locationMentions);
      if (provinceMentioned) {
        return {
          isValid: true,
          matchedFilter: filter,
          reasoning: `Reverse city match: "${filter}" is in "${province}" which matches location`,
        };
      }
    }
  }

  return {
    isValid: false,
    matchedFilter: null,
    reasoning: `Location "${userLocation}" does not match any of: ${locationFilters.join(
      ", "
    )}`,
  };
}
/**
 * Pull the profile's location text out of a LinkedIn profile page.
 *
 * Selectors are tried in priority order; for each one, every matching element's
 * trimmed text is inspected and the first text that looks like a location is
 * returned. Any error raised while querying a selector (or reading its elements)
 * abandons that selector and moves on to the next one.
 *
 * @param {Object} page - Playwright page object
 * @returns {Promise<string>} - Extracted location or empty string
 */
async function extractLocationFromProfile(page) {
  const candidateSelectors = [
    // Primary location selectors
    ".text-body-small.inline.t-black--light.break-words",
    ".pv-text-details__left-panel .text-body-small",
    ".pb2.pv-text-details__left-panel",
    ".text-body-small.inline",
    '[data-field="location_details"]',
    // Additional selectors for different LinkedIn layouts
    ".pv-text-details__left-panel-item",
    ".pv-entity__location",
    ".pv-top-card__location",
    ".pv-top-card--list-bullet .pv-top-card--list-bullet-item",
    ".artdeco-entity-lockup__subtitle",
    // Mobile/responsive selectors
    ".profile-topcard__location",
    ".profile-topcard__location-data",
  ];

  // Text containing any of these is profile chrome (counts, section titles), not a location.
  const excludedFragments = ["connection", "follower", "experience"];
  const plainNamePattern = /^[a-zA-Z\s.-]+$/;

  // Accepts "Toronto, ON", "Toronto", "Toronto, Ontario, Canada": 3-99 characters,
  // either comma-separated or made only of letters, whitespace, dots and hyphens.
  const looksLikeLocation = (candidate) => {
    if (candidate.length <= 2 || candidate.length >= 100) {
      return false;
    }
    if (!candidate.includes(",") && !plainNamePattern.test(candidate)) {
      return false;
    }
    const lowered = candidate.toLowerCase();
    return !excludedFragments.some((fragment) => lowered.includes(fragment));
  };

  for (const selector of candidateSelectors) {
    try {
      const handles = await page.$$(selector);
      for (const handle of handles) {
        const rawText = await handle.textContent();
        const trimmed = rawText ? rawText.trim() : "";
        if (trimmed && looksLikeLocation(trimmed)) {
          return trimmed;
        }
      }
    } catch (e) {
      // Best effort: a failing selector is skipped so the remaining ones still run
    }
  }
  return "";
}
/**
 * Summarize how many cities the lookup tables cover.
 *
 * @returns {Object} One entry per province key of CITIES_BY_PROVINCE holding the
 *   length of that province's city list, plus `total`: the number of keys in
 *   CITY_TO_PROVINCE (distinct city names, so it can be smaller than the sum of
 *   the per-province counts).
 */
function getCoverageStats() {
  const coverage = Object.entries(CITIES_BY_PROVINCE).reduce(
    (counts, [provinceName, cityList]) => {
      counts[provinceName] = cityList.length;
      return counts;
    },
    {}
  );
  coverage.total = Object.keys(CITY_TO_PROVINCE).length;
  return coverage;
}
// Public API of this module (CommonJS): the filter parser, the location validator,
// the Playwright-based profile location extractor, the three lookup tables, and the
// coverage-statistics helper.
module.exports = {
parseLocationFilters,
validateLocationAgainstFilters,
extractLocationFromProfile,
CITY_TO_PROVINCE,
CITIES_BY_PROVINCE,
PROVINCE_VARIATIONS,
getCoverageStats,
};