scraping.py
import csv
import json
import logging
import random
import time
from datetime import datetime

import pandas as pd
import requests
from bs4 import BeautifulSoup

logging.basicConfig(
    filename='fresherworld_scraper.log',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)

BASE_URL = "https://www.freshersworld.com/python-jobs/3535127"
LIMIT = 20         # jobs requested per page (used for the offset pagination below)
TOTAL_PAGES = 214  # maximum number of result pages to crawl

def safe_text(parent, tag, class_name):
    """Return the stripped text of the first matching element, or None if absent."""
    try:
        element = parent.find(tag, class_=class_name)
        return element.get_text(strip=True) if element else None
    except AttributeError as e:
        logging.error(f"safe_text error: {e}")
        return None

def safe_attr(parent, tag, class_name, attr):
    """Return the given attribute of the first matching element, or None if absent."""
    try:
        element = parent.find(tag, class_=class_name)
        return element.get(attr) if element else None
    except AttributeError as e:
        logging.error(f"safe_attr error: {e}")
        return None

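# Illustrative sketch (not called by the scraper): how the safe_* helpers behave on a
# tiny, made-up HTML fragment. The tag and class names here are hypothetical and only
# demonstrate the "None on missing element" behaviour.
def _example_safe_helpers():
    sample_html = '<div><span class="title">Python Developer</span><a class="link" href="/job/1">View</a></div>'
    fragment = BeautifulSoup(sample_html, "lxml")
    print(safe_text(fragment, "span", "title"))      # -> "Python Developer"
    print(safe_attr(fragment, "a", "link", "href"))  # -> "/job/1"
    print(safe_text(fragment, "span", "missing"))    # -> None, no exception raised
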
def fetch_page(session, url):
    """Fetch a URL with up to three retries; return the HTML text or None on failure."""
    for attempt in range(3):
        try:
            response = session.get(url, timeout=10)
            response.raise_for_status()
            return response.text
        except requests.RequestException as e:
            logging.error(f"[Attempt {attempt + 1}] Error fetching {url} → {e}")
            time.sleep(2)
    logging.error(f"❌ Failed after 3 attempts: {url}")
    return None

def parse_jobs(html):
    """Parse one listing page and return a list of job dicts; empty list if html is None."""
    jobs_list = []
    if html:
        soup = BeautifulSoup(html, "lxml")
        jobs = soup.find_all("div", class_="col-md-12 col-lg-12 col-xs-12 padding-none job-container jobs-on-hover top_space")
        dates = soup.find_all("div", class_="text-ago")
        for date, job in zip(dates, jobs):
            try:
                role_name = safe_text(job, "span", "wrap-title seo_title")
                company_name = safe_text(job, "h3", "latest-jobs-title font-16 margin-none inline-block company-name")
                location = safe_text(job, "a", "bold_font")
                experience = safe_text(job, "span", "experience job-details-span")
                salary = safe_text(job, "span", "qualifications display-block modal-open pull-left job-details-span")
                description = safe_text(job, "span", "desc")
                post_date = safe_text(date, "span", "ago-text")
                job_link = job.get("job_display_url")
                jobs_list.append({
                    "Role": role_name,
                    "Company Name": company_name,
                    "Location": location,
                    "Experience": experience,
                    "Salary": salary,
                    "Description": description,
                    "Post Date": post_date,
                    "Link": job_link
                })
            except Exception as e:
                logging.warning(f"⚠️ Skipping a job due to unexpected error: {e}")
    return jobs_list

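# Illustrative sketch (not called by the scraper): parse_jobs can be exercised offline
# against a saved copy of a listing page, which is handy for debugging the selectors
# without hitting the site. "saved_page.html" is a hypothetical file name.
def _example_parse_offline(path="saved_page.html"):
    with open(path, encoding="utf-8") as f:
        jobs = parse_jobs(f.read())
    # Each entry is a dict with the keys Role, Company Name, Location, Experience,
    # Salary, Description, Post Date and Link.
    print(f"Parsed {len(jobs)} jobs")
    return jobs
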
def scrape_jobs(max_pages=TOTAL_PAGES, progress_callback=None):
    """Scrape up to max_pages listing pages and return all parsed jobs as a list of dicts."""
    logging.info("🟢 Scraper started")
    print("Scraper started 🟢")
    all_jobs = []
    session = requests.Session()
    session.headers.update({
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116 Safari/537.36"
    })
    # Cap the requested page count at TOTAL_PAGES so the crawl never runs past the last page.
    limit_pages = min(max_pages, TOTAL_PAGES)
    for page in range(limit_pages):
        if progress_callback:
            progress_callback(page + 1, limit_pages)
        if page == 0:
            url = BASE_URL
        else:
            url = f"{BASE_URL}?&limit={LIMIT}&offset={LIMIT * page}"
        logging.info(f"Scraping page {page + 1} of {limit_pages}: {url}")
        print(f"➡ Scraping page {page + 1}/{limit_pages}...")
        html = fetch_page(session, url)
        page_jobs = parse_jobs(html)
        all_jobs.extend(page_jobs)
        # Random delay between requests to avoid hammering the site.
        time.sleep(random.uniform(1.5, 3.5))
    logging.info("🟢 Scraper finished")
    print("Scraper finished 🟢")
    return all_jobs

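# Illustrative sketch (not called automatically): a minimal progress callback of the shape
# scrape_jobs expects, i.e. (current_page, total_pages). The percentage formatting is just
# one assumption about how progress might be reported.
def _print_progress(current_page, total_pages):
    percent = current_page / total_pages * 100
    print(f"Progress: {current_page}/{total_pages} pages ({percent:.0f}%)")

# Example: scrape only the first two pages and report progress.
# data = scrape_jobs(max_pages=2, progress_callback=_print_progress)
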
def save_csv(data):
    """Write the scraped jobs to a dated CSV file."""
    if not data:
        logging.warning("No data to save as CSV.")
        print("⚠️ No data to save as CSV.")
        return
    try:
        file_name = f"fresher_world_{datetime.now().strftime('%Y-%m-%d')}.csv"
        with open(file_name, "w", encoding="utf-8", newline="") as file:
            writer = csv.DictWriter(file, fieldnames=data[0].keys())
            writer.writeheader()
            writer.writerows(data)
        logging.info(f"CSV saved: {file_name}")
        print(f"✅ CSV saved: {file_name}")
    except Exception as e:
        logging.error(f"Error saving CSV: {e}")

def save_json(data):
    """Write the scraped jobs to a dated JSON file."""
    try:
        file_name = f"fresher_world_{datetime.now().strftime('%Y-%m-%d')}.json"
        with open(file_name, "w", encoding="utf-8") as file:
            json.dump(data, file, indent=4, ensure_ascii=False)
        logging.info(f"JSON saved: {file_name}")
        print(f"✅ JSON saved: {file_name}")
    except Exception as e:
        logging.error(f"Error saving JSON: {e}")

def save_excel(data):
    """Write the scraped jobs to a dated Excel file (requires openpyxl)."""
    if data:
        filename = f"fresher_world_{datetime.now().strftime('%Y-%m-%d')}.xlsx"
        df = pd.DataFrame(data)
        df.to_excel(filename, index=False)
        logging.info(f"Excel saved: {filename}")
        print(f"✅ Excel saved: {filename}")
    else:
        logging.warning("No data to save.")
        print("⚠️ No data to save.")

if __name__ == "__main__":
    data = scrape_jobs()
    save_excel(data)
    save_csv(data)
    save_json(data)