diff --git a/jobscraper/__init__.py b/jobscraper/__init__.py
index cc59e2f..5417a8f 100644
--- a/jobscraper/__init__.py
+++ b/jobscraper/__init__.py
@@ -36,11 +36,13 @@
 from jobscraper.glassdoor import GlassdoorScraper  # noqa: E402
 from jobscraper.indeed import IndeedScraper  # noqa: E402
 from jobscraper.linkedin import LinkedInScraper  # noqa: E402
+from jobscraper.naukri import NaukriScraper  # noqa: E402
 
 SCRAPER_MAPPING: dict[Site, type] = {
     Site.INDEED: IndeedScraper,
     Site.GLASSDOOR: GlassdoorScraper,
     Site.LINKEDIN: LinkedInScraper,
+    Site.NAUKRI: NaukriScraper,
 }
 
 
diff --git a/jobscraper/exception.py b/jobscraper/exception.py
index 629c1f8..3c1fcab 100644
--- a/jobscraper/exception.py
+++ b/jobscraper/exception.py
@@ -29,4 +29,22 @@ def __init__(self, message: str | None = None):
         super().__init__(message or "An error occurred with LinkedIn")
 
 
-# class NaukriException(Exception): pass  # planned
+class UpworkException(Exception):
+    """Raised when the Upwork scraper encounters an unrecoverable error."""
+
+    def __init__(self, message: str | None = None):
+        super().__init__(message or "An error occurred with Upwork")
+
+
+class InternshalaException(Exception):
+    """Raised when the Internshala scraper encounters an unrecoverable error."""
+
+    def __init__(self, message: str | None = None):
+        super().__init__(message or "An error occurred with Internshala")
+
+
+class NaukriException(Exception):
+    """Raised when the Naukri scraper encounters an unrecoverable error."""
+
+    def __init__(self, message: str | None = None):
+        super().__init__(message or "An error occurred with Naukri")
diff --git a/jobscraper/model.py b/jobscraper/model.py
index 7c0575d..1b25eed 100644
--- a/jobscraper/model.py
+++ b/jobscraper/model.py
@@ -20,11 +20,12 @@ class Site(str, Enum):
     INDEED = "indeed"
     GLASSDOOR = "glassdoor"
     LINKEDIN = "linkedin"
-    # NAUKRI = "naukri"  # planned
+    UPWORK = "upwork"
+    INTERNSHALA_JOBS = "internshala_jobs"
+    INTERNSHALA_INTERNSHIPS = "internshala_internships"
+    NAUKRI = "naukri"
     # FOUNDIT = "foundit"  # planned
     # SHINE = "shine"  # planned
-    # INTERNSHALA = "internshala"  # planned
-    # UPWORK = "upwork"  # planned
     # APNA = "apna"  # planned
 
 
diff --git a/jobscraper/naukri/__init__.py b/jobscraper/naukri/__init__.py
new file mode 100644
index 0000000..1d51e08
--- /dev/null
+++ b/jobscraper/naukri/__init__.py
@@ -0,0 +1,7 @@
+"""Naukri job scraper module."""
+
+from __future__ import annotations
+
+from jobscraper.naukri._scraper import NaukriScraper
+
+__all__ = ["NaukriScraper"]
diff --git a/jobscraper/naukri/_scraper.py b/jobscraper/naukri/_scraper.py
new file mode 100644
index 0000000..4d4f11a
--- /dev/null
+++ b/jobscraper/naukri/_scraper.py
@@ -0,0 +1,154 @@
+"""Naukri scraper implementation using headless browser."""
+
+from __future__ import annotations
+
+import random
+import time
+from datetime import date
+from typing import Any
+from urllib.parse import urlencode
+
+from jobscraper.exception import NaukriException
+from jobscraper.model import JobPost, JobResponse, Scraper, ScraperInput, Site
+from jobscraper.naukri.constant import NAUKRI_HEADERS, JOB_TYPE_MAP, SEARCH_URL
+from jobscraper.naukri.util import (
+    parse_compensation,
+    parse_location,
+    parse_search_html,
+)
+from jobscraper.util import create_logger, get_enum_from_job_type, markdown_converter
+
+logger = create_logger("naukri")
+
+
+class NaukriScraper(Scraper):
+    """Scraper for naukri.com using headless browser (Playwright).
+
+    Uses browser automation to render CSR-based page and extract job listings.
+    """
+
+    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+        """Fetch job listings from Naukri.
+
+        Args:
+            scraper_input: Validated scraper config.
+
+        Returns:
+            JobResponse with collected JobPost objects.
+
+        Raises:
+            NaukriException: If Playwright is missing, the browser cannot be
+                launched, or another critical browser error occurs.
+        """
+        try:
+            from playwright.sync_api import sync_playwright
+        except ImportError as e:
+            raise NaukriException(
+                "Playwright required for Naukri scraper. "
+                "Install: pip install playwright && playwright install"
+            ) from e
+
+        jobs: list[JobPost] = []
+
+        try:
+            with sync_playwright() as p:
+                browser = p.chromium.launch(headless=True)
+                try:
+                    self._collect_jobs(browser, scraper_input, jobs)
+                finally:
+                    # Close the browser while Playwright is still running;
+                    # calls made after the context manager exits fail.
+                    browser.close()
+        except NaukriException:
+            raise
+        except Exception as e:
+            raise NaukriException(f"Browser error: {e}") from e
+
+        return JobResponse(jobs=jobs)
+
+    def _collect_jobs(
+        self, browser: Any, scraper_input: ScraperInput, jobs: list[JobPost]
+    ) -> None:
+        """Page through search results, appending parsed posts to ``jobs``.
+
+        Stops when enough results are collected, a page yields no cards, or
+        a page fails to load (partial results are kept in that case).
+        """
+        # Bypass headless detection
+        page = browser.new_page(
+            viewport={"width": 1920, "height": 1080},
+            user_agent=NAUKRI_HEADERS["User-Agent"],
+        )
+
+        # Add headers to bypass detection
+        page.set_extra_http_headers({
+            "Accept": NAUKRI_HEADERS["Accept"],
+            "Accept-Language": NAUKRI_HEADERS["Accept-Language"],
+            "Sec-Fetch-Dest": "document",
+            "Sec-Fetch-Mode": "navigate",
+            "Sec-Fetch-Site": "none",
+            "Sec-Fetch-User": "?1",
+        })
+
+        page_no = 1
+        while len(jobs) < scraper_input.results_wanted:
+            try:
+                url = self._build_url(
+                    scraper_input.search_term,
+                    scraper_input.location,
+                    page_no,
+                )
+                logger.info(f"Fetching page {page_no}: {url}")
+                page.goto(url, wait_until="networkidle", timeout=30000)
+                time.sleep(random.uniform(0.5, 1.5))
+
+                job_cards = parse_search_html(page.content())
+            except Exception as e:
+                logger.error(f"Error on page {page_no}: {e}")
+                break
+
+            if not job_cards:
+                logger.info("No more jobs found")
+                break
+
+            for card in job_cards:
+                if len(jobs) >= scraper_input.results_wanted:
+                    break
+
+                try:
+                    jobs.append(self._card_to_jobpost(card))
+                except Exception as e:
+                    logger.warning(f"Failed to parse card {card.get('id')}: {e}")
+
+            page_no += 1
+            time.sleep(random.uniform(0.5, 1.5))
+
+    def _build_url(self, search_term: str, location: str | None, page: int) -> str:
+        """Build the Naukri search URL with percent-encoded query parameters.
+
+        Encoding keeps terms containing spaces, ``&`` or ``#`` (e.g. ``C#``,
+        ``R&D``) from corrupting the query string.
+        """
+        params: dict[str, str | int] = {"keyword": search_term}
+        if location:
+            params["location"] = location
+        params["pageNo"] = page
+        return f"{SEARCH_URL}?{urlencode(params)}"
+
+    def _card_to_jobpost(self, card: dict[str, Any]) -> JobPost:
+        """Convert job card dict to JobPost."""
+        # Unknown/missing job types map to None instead of a ``[None]`` list.
+        job_type = JOB_TYPE_MAP.get(card.get("job_type"))
+        return JobPost(
+            id=str(card.get("id", "")),
+            site=Site.NAUKRI,
+            job_url=card.get("job_url") or f"https://www.naukri.com/jobs/{card.get('id')}",
+            title=card.get("title") or "Unknown",
+            company=card.get("company"),
+            location=parse_location(card.get("location")),
+            compensation=parse_compensation(card.get("salary")),
+            job_type=[job_type] if job_type else None,
+            is_remote=None,
+            job_level=card.get("experience"),
+            description=None,
+        )
diff --git a/jobscraper/naukri/constant.py b/jobscraper/naukri/constant.py
new file mode 100644
index 0000000..023b1f7
--- /dev/null
+++ b/jobscraper/naukri/constant.py
@@ -0,0 +1,29 @@
+"""Constants for the Naukri scraper."""
+
+from __future__ import annotations
+
+from jobscraper.model import JobType
+
+BASE_URL = "https://www.naukri.com"
+SEARCH_URL = BASE_URL + "/search-results"
+
+NAUKRI_HEADERS: dict[str, str] = {
+    "User-Agent": (
+        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+        "AppleWebKit/537.36 (KHTML, like Gecko) "
+        "Chrome/120.0.0.0 Safari/537.36"
+    ),
+    "Accept": (
+        "text/html,application/xhtml+xml,application/xml;"
+        "q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
+    ),
+    "Accept-Language": "en-US,en;q=0.9",
+    "Referer": BASE_URL + "/",
+}
+
+JOB_TYPE_MAP: dict[str, JobType] = {
+    "Full Time": JobType.FULL_TIME,
+    "Part Time": JobType.PART_TIME,
+    "Contract": JobType.CONTRACT,
+    "Temporary": JobType.TEMPORARY,
+}
diff --git a/jobscraper/naukri/util.py b/jobscraper/naukri/util.py
new file mode 100644
index 0000000..bedd192
--- /dev/null
+++ b/jobscraper/naukri/util.py
@@ -0,0 +1,145 @@
+"""Utility functions for the Naukri scraper."""
+
+from __future__ import annotations
+
+import logging
+import re
+from urllib.parse import urljoin
+
+from bs4 import BeautifulSoup
+
+from jobscraper.model import Compensation, CompensationInterval, Location
+
+logger = logging.getLogger(__name__)
+
+_NAUKRI_BASE_URL = "https://www.naukri.com"
+
+# An amount must start with a digit so stray punctuation (e.g. a lone ",")
+# is never captured as a number.
+_AMOUNT_RE = re.compile(r"\d[\d,]*(?:\.\d+)?")
+
+# Naukri quotes annual pay in lakhs: "LPA", "Lacs PA", "Lakh", ...
+_LAKH_RE = re.compile(r"(?<![a-z])(?:lpa|lacs?|lakhs?)(?![a-z])")
+
+_UNDISCLOSED_PHRASES = ("not disclosed", "confidential", "as per")
+
+
+def parse_location(raw: str | None) -> Location:
+    """Parse Naukri location string to Location model.
+
+    All results India-based.
+    Handles: 'Bangalore', 'Bangalore, Karnataka', 'Remote', 'Work from Home'.
+    """
+    if not raw or not raw.strip():
+        return Location(country="India")
+
+    raw = raw.strip()
+
+    if raw.lower() in ("remote", "work from home", "pan india", "wfh"):
+        return Location(country="India")
+
+    parts = [p.strip() for p in raw.split(",")]
+    if len(parts) == 1:
+        return Location(city=parts[0], country="India")
+    elif len(parts) == 2:
+        return Location(city=parts[0], state=parts[1], country="India")
+    else:
+        return Location(city=parts[0], country="India")
+
+
+def parse_compensation(raw: str | None) -> Compensation | None:
+    """Parse compensation string from Naukri card.
+
+    Handles:
+    - '₹4 - 7 LPA' / '3-8 Lacs PA' → yearly INR (multiply by 100,000)
+    - '₹15,000 - ₹20,000 /month' → monthly INR
+    - '₹500 /day' → daily INR
+    - 'Not disclosed' → None
+
+    Amounts without a recognised unit are treated as yearly when the lower
+    bound is at least 100,000 and as monthly otherwise.
+    """
+    if not raw or not raw.strip():
+        return None
+
+    text = raw.strip().lower()
+
+    if any(phrase in text for phrase in _UNDISCLOSED_PHRASES):
+        return None
+
+    values = [float(n.replace(",", "")) for n in _AMOUNT_RE.findall(text)]
+    if not values:
+        return None
+
+    # Lakh-denominated figures are scaled to rupees.
+    in_lakhs = _LAKH_RE.search(text) is not None
+    scale = 100_000 if in_lakhs else 1
+    min_val = values[0] * scale
+    max_val = values[1] * scale if len(values) >= 2 else None
+
+    if "/month" in text or "per month" in text:
+        interval = CompensationInterval.MONTHLY
+    elif "/day" in text or "per day" in text:
+        interval = CompensationInterval.DAILY
+    elif "/hour" in text or "per hour" in text:
+        interval = CompensationInterval.HOURLY
+    elif in_lakhs or min_val >= 100_000:
+        interval = CompensationInterval.YEARLY
+    else:
+        interval = CompensationInterval.MONTHLY
+
+    return Compensation(
+        min_amount=min_val,
+        max_amount=max_val,
+        interval=interval,
+        currency="INR",
+    )
+
+
+def _text_of(elem) -> str | None:
+    """Return the stripped text of a tag, or None when the tag is missing."""
+    return elem.get_text(strip=True) if elem is not None else None
+
+
+def parse_search_html(html: str) -> list[dict]:
+    """Extract job postings from Naukri search results page.
+
+    Returns one dict per ``article.jobTuple`` card that carries an id, with
+    keys ``id``, ``title``, ``company``, ``location``, ``experience``,
+    ``salary``, ``job_type`` and ``job_url``; missing fields are ``None``.
+    """
+    soup = BeautifulSoup(html, "lxml")
+    jobs = []
+
+    for job_card in soup.find_all("article", class_="jobTuple"):
+        try:
+            job_id = job_card.get("data-jobid") or job_card.get("id")
+            if not job_id:
+                continue
+
+            title_elem = job_card.find("a", class_="jobTitle")
+
+            # Prefer the title link; otherwise use the first link in the card.
+            if title_elem is not None and title_elem.get("href"):
+                href = title_elem["href"]
+            else:
+                link_elem = job_card.find("a", href=True)
+                href = link_elem["href"] if link_elem is not None else None
+
+            jobs.append({
+                "id": job_id,
+                "title": _text_of(title_elem),
+                "company": _text_of(job_card.find("a", class_="companyName")),
+                "location": _text_of(job_card.find("span", class_="locWc")),
+                "experience": _text_of(job_card.find("span", class_="exp")),
+                "salary": _text_of(job_card.find("span", class_="sal")),
+                "job_type": _text_of(job_card.find("span", class_="jobType")),
+                # urljoin resolves relative hrefs and keeps absolute ones.
+                "job_url": urljoin(_NAUKRI_BASE_URL, href) if href else None,
+            })
+        except Exception:
+            # Best effort: one malformed card must not discard the page.
+            logger.debug("Skipping unparsable Naukri job card", exc_info=True)
+            continue
+
+    return jobs
diff --git a/pyproject.toml b/pyproject.toml
index 0dc3b22..4f81a5f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,9 +31,13 @@ dependencies = [
 ]
 
 [project.optional-dependencies]
+browser = [
+    "playwright>=1.40.0",
+]
 test = [
     "pytest>=7.4.0",
     "pytest-cov>=4.1.0",
+    "playwright>=1.40.0",
 ]
 lint = [
     "black>=24.1.0",
@@ -48,6 +52,7 @@ dev = [
     "pre-commit>=3.5.0",
     "pytest>=7.4.0",
     "pytest-cov>=4.1.0",
+    "playwright>=1.40.0",
 ]
 
 [build-system]
diff --git a/tests/integration/test_naukri_integration.py b/tests/integration/test_naukri_integration.py
new file mode 100644
index 0000000..089bd71
--- /dev/null
+++ b/tests/integration/test_naukri_integration.py
@@ -0,0 +1,41 @@
+"""Integration tests for Naukri scraper (live site).
+
+Note: Naukri.com uses Akamai WAF which blocks headless browser automation.
+These tests are expected to fail unless run from a residential IP or with proxy.
+"""
+
+import pytest
+from jobscraper.model import ScraperInput, Site
+from jobscraper.naukri import NaukriScraper
+
+
+@pytest.mark.integration
+@pytest.mark.xfail(strict=False, reason="Naukri WAF blocks headless browser (Akamai)")
+def test_naukri_scraper_live_search():
+    """Test live Naukri search."""
+    scraper = NaukriScraper()
+    input_data = ScraperInput(
+        site_name=[Site.NAUKRI],
+        search_term="python",
+        location="Bangalore",
+        results_wanted=5,
+    )
+    response = scraper.scrape(input_data)
+    assert response.jobs is not None
+    # This assertion will fail due to WAF
+    assert len(response.jobs) > 0
+
+
+@pytest.mark.integration
+@pytest.mark.xfail(strict=False, reason="Naukri WAF blocks headless browser")
+def test_naukri_scraper_remote_jobs():
+    """Test Naukri remote job search."""
+    scraper = NaukriScraper()
+    input_data = ScraperInput(
+        site_name=[Site.NAUKRI],
+        search_term="data scientist",
+        location="Remote",
+        results_wanted=5,
+    )
+    response = scraper.scrape(input_data)
+    assert response.jobs is not None
diff --git a/tests/test_naukri.py b/tests/test_naukri.py
new file mode 100644
index 0000000..8ef1448
--- /dev/null
+++ b/tests/test_naukri.py
@@ -0,0 +1,111 @@
+"""Unit tests for Naukri scraper parsing utilities."""
+
+import pytest
+from jobscraper.naukri.util import parse_location, parse_compensation, parse_search_html
+from jobscraper.model import CompensationInterval
+
+
+class TestParseLocation:
+    """Test location parsing."""
+
+    def test_city_only(self):
+        result = parse_location("Bangalore")
+        assert result.city == "Bangalore"
+        assert result.country == "India"
+
+    def test_city_state(self):
+        result = parse_location("Bangalore, Karnataka")
+        assert result.city == "Bangalore"
+        assert result.state == "Karnataka"
+        assert result.country == "India"
+
+    def test_remote_variations(self):
+        for remote_text in ["Remote", "Work from Home", "WFH", "Pan India"]:
+            result = parse_location(remote_text)
+            assert result.country == "India"
+            assert result.city is None
+
+    def test_empty(self):
+        result = parse_location(None)
+        assert result.country == "India"
+        assert result.city is None
+
+
+class TestParseCompensation:
+    """Test compensation parsing."""
+
+    def test_lpa_range(self):
+        result = parse_compensation("₹4 - 7 LPA")
+        assert result is not None
+        assert result.min_amount == 400_000
+        assert result.max_amount == 700_000
+        assert result.interval == CompensationInterval.YEARLY
+        assert result.currency == "INR"
+
+    def test_lacs_pa_range(self):
+        result = parse_compensation("3-8 Lacs PA")
+        assert result is not None
+        assert result.min_amount == 300_000
+        assert result.max_amount == 800_000
+        assert result.interval == CompensationInterval.YEARLY
+
+    def test_monthly_range(self):
+        result = parse_compensation("₹15,000 - ₹20,000 /month")
+        assert result is not None
+        assert result.min_amount == 15_000
+        assert result.max_amount == 20_000
+        assert result.interval == CompensationInterval.MONTHLY
+
+    def test_daily_rate(self):
+        result = parse_compensation("₹500 /day")
+        assert result is not None
+        assert result.min_amount == 500
+        assert result.interval == CompensationInterval.DAILY
+
+    def test_not_disclosed(self):
+        result = parse_compensation("Not disclosed")
+        assert result is None
+
+    def test_empty(self):
+        result = parse_compensation(None)
+        assert result is None
+
+
+class TestParseSearchHTML:
+    """Test HTML parsing with mock Naukri HTML."""
+
+    def test_parse_single_job_card(self):
+        html = """
+        <html><body>
+        <article class="jobTuple" data-jobid="12345">
+            <a class="jobTitle" href="/job/12345">Python Developer</a>
+            <a class="companyName" href="/company/tech-corp">Tech Corp</a>
+            <span class="locWc">Bangalore</span>
+            <span class="exp">3 - 5 years</span>
+            <span class="sal">₹8 - 12 LPA</span>
+            <span class="jobType">Full Time</span>
+        </article>
+        </body></html>
+        """
+        jobs = parse_search_html(html)
+        assert len(jobs) == 1
+        assert jobs[0]["id"] == "12345"
+        assert jobs[0]["title"] == "Python Developer"
+        assert jobs[0]["company"] == "Tech Corp"
+        assert jobs[0]["location"] == "Bangalore"
+        assert jobs[0]["job_url"] == "https://www.naukri.com/job/12345"
+
+    def test_parse_multiple_cards(self):
+        html = """
+        <html><body>
+        <article class="jobTuple" data-jobid="1"><a class="jobTitle" href="/job/1">Job 1</a></article>
+        <article class="jobTuple" data-jobid="2"><a class="jobTitle" href="/job/2">Job 2</a></article>
+        </body></html>
+        """
+        jobs = parse_search_html(html)
+        assert len(jobs) == 2
+
+    def test_parse_empty_html(self):
+        html = "<html><body></body></html>"
+        jobs = parse_search_html(html)
+        assert len(jobs) == 0