-
Notifications
You must be signed in to change notification settings - Fork 1
Expand file tree
/
Copy pathLinkedInScraper.py
More file actions
290 lines (251 loc) · 13.4 KB
/
Copy pathLinkedInScraper.py
File metadata and controls
290 lines (251 loc) · 13.4 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
# LinkedInScraper.py
import copy
import json
import logging
import os
import re
import time
from typing import Any, Callable, Dict, List, Optional

from bs4 import BeautifulSoup as bs
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.remote.webelement import WebElement
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
class LinkedInScraper:
    """Scrape the public sections of a LinkedIn profile through Selenium.

    The page is driven with the supplied WebDriver and the rendered HTML is
    parsed with BeautifulSoup. Each profile section (experience, education,
    volunteering, skills) is collected by the same generic routine and turned
    into dictionaries by a section-specific parser.
    """

    # Selectors shared by the item parsers. They mirror LinkedIn's current
    # markup and are the first thing to check when parsing starts failing.
    _HEADLINE_SELECTOR = "div.display-flex.flex-row.justify-space-between span[aria-hidden='true']"
    _META_SELECTOR = "span.t-14.t-normal"
    _HIDDEN_CLASS = "visually-hidden"
    _LIST_CONTAINER_CLASS = "scaffold-finite-scroll__content"
    _SKILLS_SHOW_ALL_XPATH = (
        "//div[contains(@class, 'pv-action')]"
        "//a[contains(@href, '/details/skills')"
        " and contains(@class, 'artdeco-button')"
        " and .//span[starts-with(normalize-space(.), 'Show all')]]"
    )

    def __init__(self, driver: WebDriver, save_to_file: bool = False):
        """Store the driver and create the shared explicit wait.

        Args:
            driver: WebDriver used for all navigation and page reads.
            save_to_file: When true, ``scrape`` also writes its JSON output
                under ``./data``.
        """
        self.driver = driver
        self.save = save_to_file
        self.wait = WebDriverWait(self.driver, 15)

    def scrape(self, url: str) -> str:
        """Scrape a single profile and return the result as a JSON string.

        Args:
            url: Profile URL to open.

        Returns:
            A JSON document with the keys ``url``, ``name``, ``location``,
            ``experience``, ``education``, ``volunteering`` and ``skills``.
            If anything raises, a JSON document with ``error`` and ``url``
            is returned instead; this method itself does not raise.
        """
        try:
            self._navigate_to_profile(url)
            profile_soup = bs(self.driver.page_source, "lxml")

            # --- Basic Information ---
            name = self._get_name(profile_soup)
            location = self._get_location(profile_soup)
            logging.info(f"Scraping data for: {name}")

            # --- Section Scraping ---
            experience = self._scrape_section('experience', 'experiences', self._parse_experience_item)
            education = self._scrape_section('education', 'education', self._parse_education_item)
            volunteering = self._scrape_section('volunteering_experience', 'volunteer-experiences', self._parse_volunteer_item)
            skills = self._scrape_section('skills', 'skills', self._parse_skill_item)

            # --- Compile and Save/Return Output ---
            output = {
                "url": self.driver.current_url,
                "name": name,
                "location": location,
                "experience": experience,
                "education": education,
                "volunteering": volunteering,
                "skills": skills,
            }
            json_output = json.dumps(output, indent=4)
            if self.save:
                self._save_output_to_file(json_output)
            return json_output
        except Exception as e:
            # Top-level boundary: keep the traceback in the log, but hand the
            # caller a JSON error payload instead of raising.
            logging.exception(f"An error occurred while scraping {url}: {e}")
            return json.dumps({"error": str(e), "url": url})

    def _navigate_to_profile(self, url: str) -> None:
        """Open ``url`` and wait until the ``profile-content`` element exists.

        Raises:
            TimeoutException: If the element does not appear within the wait
                timeout; a warning is logged before re-raising.
        """
        logging.info(f"Navigating to {url}")
        self.driver.get(url)
        try:
            # Wait for the main profile card to be present
            self.wait.until(EC.presence_of_element_located((By.ID, 'profile-content')))
        except TimeoutException:
            logging.warning("Could not load main profile element. The page may be private or invalid.")
            raise

    def _get_name(self, soup: bs) -> Optional[str]:
        """Return the profile name from ``soup``, or None when missing/empty."""
        name_tag = soup.select_one('section div span a h1')
        name = name_tag.get_text(strip=True) if name_tag else ""
        if not name:
            logging.warning("Could not find name element.")
            return None
        return name

    def _get_location(self, soup: bs) -> Optional[str]:
        """Return the profile location text from ``soup``, or None if absent."""
        location_tag = soup.find("span", class_="text-body-small inline t-black--light break-words")
        if location_tag is None:
            logging.warning("Could not find location element.")
            return None
        return location_tag.get_text(strip=True)

    def _scrape_section(self, section_id: str, details_suffix: str, parser: Callable[[Any], Optional[Dict]]) -> List[Dict]:
        """Collect and parse every item of one profile section.

        If a "Show all" link for the section is present, the items are read
        from the details page it leads to; otherwise (or if that page fails
        to load its list) they are read from the main profile page.

        Args:
            section_id: ``id`` of the section's anchor ``div`` on the profile.
            details_suffix: Suffix identifying the section's "Show all" link.
            parser: Callable turning one parsed ``li`` node into a dict, or a
                falsy value when the item should be skipped.

        Returns:
            The truthy results of ``parser`` in page order.
        """
        items = None
        show_all_button = self._find_show_all_link(details_suffix)
        if show_all_button is None:
            logging.info(f"No 'Show all' button for '{section_id}'. Scraping from main profile.")
        else:
            items = self._collect_details_items(show_all_button, section_id)
        if items is None:
            items = self._collect_profile_items(section_id)

        parsed_items = (parser(item) for item in items)
        return [entry for entry in parsed_items if entry]

    def _find_show_all_link(self, details_suffix: str) -> Optional[WebElement]:
        """Return the section's "Show all" element, or None if the wait times out."""
        if details_suffix == "skills":
            # The skills link has no static id, so it is matched structurally.
            locator = (By.XPATH, self._SKILLS_SHOW_ALL_XPATH)
        else:
            # Static ID case
            locator = (By.ID, f"navigation-index-see-all-{details_suffix}")
        try:
            return self.wait.until(EC.presence_of_element_located(locator))
        except TimeoutException:
            return None

    def _collect_details_items(self, show_all_button: WebElement, section_id: str) -> Optional[List[Any]]:
        """Click through to the details page and return its list items.

        The browser is always sent back to the previous page afterwards so
        the next section starts from the main profile.

        Returns:
            The ``li`` nodes found on the details page, or None when the
            details list did not load within the wait timeout.
        """
        show_all_button.click()
        logging.info(f"Found 'Show all' button for '{section_id}'. Navigating to details page.")
        try:
            # Wait for the list on the details page to load
            self.wait.until(EC.presence_of_element_located((By.CLASS_NAME, self._LIST_CONTAINER_CLASS)))
            # Scroll to the bottom so lazily loaded items get rendered
            self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(1)  # Allow time for lazy loading
            page_soup = bs(self.driver.page_source, "lxml")
        except TimeoutException:
            logging.warning(f"Details page for '{section_id}' did not load. Falling back to main profile.")
            return None
        finally:
            # Go back to main profile page for the next section, even on failure
            self.driver.back()

        container = page_soup.find("div", class_=self._LIST_CONTAINER_CLASS)
        items = container.find_all("li", class_="pvs-list__paged-list-item", recursive=True) if container else []
        logging.info(f"Found {len(items)} items in '{section_id}' section.")
        logging.info(f"Scraping full list for '{section_id}' from details page.")
        return items

    def _collect_profile_items(self, section_id: str) -> List[Any]:
        """Return the section's direct ``li`` children from the current page.

        Returns an empty list (with a warning) when the section anchor or the
        ``ul`` following it cannot be found.
        """
        page_soup = bs(self.driver.page_source, "lxml")
        section_anchor = page_soup.find("div", id=section_id)
        if not section_anchor:
            logging.warning(f"Section '{section_id}' not found on the profile.")
            return []
        item_list = section_anchor.find_next("ul")
        if item_list is None:
            logging.warning(f"Section '{section_id}' has no item list on the profile.")
            return []
        return item_list.find_all("li", recursive=False)

    # --- INDIVIDUAL PARSERS ---
    # These contain the most brittle logic and are easiest to fix when isolated.
    # NOTE: Relies on `visually-hidden` spans for accessibility which is subject to change.

    @classmethod
    def _visible_text(cls, element: Any) -> str:
        """Return ``element``'s stripped text without ``visually-hidden`` spans.

        Works on a copy so the parsed tree handed to the parser is not mutated.
        """
        clone = copy.copy(element)
        for hidden in clone.find_all("span", class_=cls._HIDDEN_CLASS):
            hidden.decompose()
        return clone.get_text(strip=True)

    @staticmethod
    def _split_dates(text: str, default_end: Optional[str]) -> tuple:
        """Split ``"start - end"`` into ``(start, end)``.

        ``default_end`` is used when ``text`` contains no ``-`` separator.
        """
        parts = text.split('-')
        start = parts[0].strip()
        end = parts[1].strip() if len(parts) > 1 else default_end
        return start, end

    def _parse_experience_item(self, item: bs) -> Optional[Dict]:
        """Parse one experience ``li`` node into a dict; None if parsing raises."""
        try:
            data = {"title": None, "company": None, "employmentType": None, "startDate": None, "endDate": None, "duration": None, "location": None}
            title_el = item.select_one(self._HEADLINE_SELECTOR)
            data["title"] = title_el.get_text(strip=True) if title_el else None

            # Remaining metadata lives in consecutive spans of the same kind
            meta_elements = item.select(self._META_SELECTOR)
            if len(meta_elements) > 0:
                # Company & Employment Type (e.g., "Google · Full-time")
                company_text = self._visible_text(meta_elements[0]).split('·')
                data["company"] = company_text[0].strip()
                if len(company_text) > 1:
                    data["employmentType"] = company_text[1].strip()
            if len(meta_elements) > 1:
                # Duration & Dates (e.g., "Jan 2022 - Present · 1 yr 5 mos")
                duration_text = self._visible_text(meta_elements[1]).split('·')
                data["startDate"], data["endDate"] = self._split_dates(duration_text[0].strip(), "Present")
                if len(duration_text) > 1:
                    data["duration"] = duration_text[1].strip()
            if len(meta_elements) > 2:
                # Location (e.g., "Mountain View, California, United States")
                data["location"] = self._visible_text(meta_elements[2]).replace('·', '-').strip()
            return data
        except Exception as e:
            # Best effort: a malformed item is skipped rather than aborting the section
            logging.debug(f"Could not parse experience item: {e}")
            return None

    def _parse_education_item(self, item: bs) -> Optional[Dict]:
        """Parse one education ``li`` node into a dict; None if parsing raises."""
        try:
            data = {"school": None, "degree": None, "fieldOfStudy": None, "startDate": None, "endDate": None}
            # School name
            school_el = item.select_one(self._HEADLINE_SELECTOR)
            data["school"] = school_el.get_text(strip=True) if school_el else None

            meta_elements = item.select(self._META_SELECTOR)
            if len(meta_elements) > 0:
                # Degree & field of study, comma separated
                degree_text = self._visible_text(meta_elements[0]).split(',')
                data["degree"] = degree_text[0].strip()
                if len(degree_text) > 1:
                    data["fieldOfStudy"] = degree_text[1].strip()
            if len(meta_elements) > 1:
                # Duration; endDate stays None when only one date is given
                data["startDate"], data["endDate"] = self._split_dates(self._visible_text(meta_elements[1]), None)
            return data
        except Exception as e:
            # Best effort: a malformed item is skipped rather than aborting the section
            logging.debug(f"Could not parse education item: {e}")
            return None

    def _parse_volunteer_item(self, item: bs) -> Optional[Dict]:
        """Parse one volunteering ``li`` node into a dict; None if parsing raises."""
        try:
            data = {"organization": None, "role": None, "startDate": None, "endDate": None, "duration": None}
            # Role
            role_el = item.select_one(self._HEADLINE_SELECTOR)
            data["role"] = role_el.get_text(strip=True) if role_el else None

            # Meta elements (organization, duration)
            meta_elements = item.select(self._META_SELECTOR)
            if len(meta_elements) > 0:
                data["organization"] = self._visible_text(meta_elements[0])
            if len(meta_elements) > 1:
                duration_text = self._visible_text(meta_elements[1]).split('·')
                data["startDate"], data["endDate"] = self._split_dates(duration_text[0].strip(), "Present")
                if len(duration_text) > 1:
                    data["duration"] = duration_text[1].strip()
            return data
        except Exception as e:
            # Best effort: a malformed item is skipped rather than aborting the section
            logging.debug(f"Could not parse volunteer item: {e}")
            return None

    def _parse_skill_item(self, item: bs) -> Optional[Dict]:
        """Parse one skill ``li`` node into ``{"skill": ...}``; None if not found."""
        try:
            skill_el = item.select_one(self._HEADLINE_SELECTOR)
            return {"skill": skill_el.get_text(strip=True)} if skill_el else None
        except Exception as e:
            logging.debug(f"Could not parse skill item: {e}")
            return None

    def _save_output_to_file(self, json_output: str) -> None:
        """Write ``json_output`` to ``./data/<profile-slug>.json``.

        The slug is the part of the driver's current URL after ``/in/``, with
        any query string or fragment dropped and remaining slashes replaced by
        underscores. Failures are logged and never raised.
        """
        try:
            slug = self.driver.current_url.split('/in/')[1]
            # Query strings / fragments are not part of the profile identity
            # and contain characters that are invalid in some file systems.
            slug = slug.split('?')[0].split('#')[0]
            filename = slug.strip('/').replace('/', '_')
            if not filename:
                logging.error("Could not generate a filename from the URL.")
                return
            # exist_ok avoids the check-then-create race
            os.makedirs("data", exist_ok=True)
            filepath = f"./data/{filename}.json"
            with open(filepath, "w", encoding="utf-8") as f:
                f.write(json_output)
            logging.info(f"File saved to {filepath}")
        except IndexError:
            logging.error("Could not generate a filename from the URL.")
        except Exception as e:
            logging.error(f"Error saving file: {e}")