diff --git a/jobscraper/__init__.py b/jobscraper/__init__.py
index cc59e2f..5417a8f 100644
--- a/jobscraper/__init__.py
+++ b/jobscraper/__init__.py
@@ -36,11 +36,13 @@
from jobscraper.glassdoor import GlassdoorScraper # noqa: E402
from jobscraper.indeed import IndeedScraper # noqa: E402
from jobscraper.linkedin import LinkedInScraper # noqa: E402
+from jobscraper.naukri import NaukriScraper # noqa: E402
SCRAPER_MAPPING: dict[Site, type] = {
Site.INDEED: IndeedScraper,
Site.GLASSDOOR: GlassdoorScraper,
Site.LINKEDIN: LinkedInScraper,
+ Site.NAUKRI: NaukriScraper,
}
diff --git a/jobscraper/exception.py b/jobscraper/exception.py
index 629c1f8..3c1fcab 100644
--- a/jobscraper/exception.py
+++ b/jobscraper/exception.py
@@ -29,4 +29,22 @@ def __init__(self, message: str | None = None):
super().__init__(message or "An error occurred with LinkedIn")
-# class NaukriException(Exception): pass # planned
+class UpworkException(Exception):
+ """Raised when the Upwork scraper encounters an unrecoverable error."""
+
+ def __init__(self, message: str | None = None):
+ super().__init__(message or "An error occurred with Upwork")
+
+
+class InternshalaException(Exception):
+ """Raised when the Internshala scraper encounters an unrecoverable error."""
+
+ def __init__(self, message: str | None = None):
+ super().__init__(message or "An error occurred with Internshala")
+
+
+class NaukriException(Exception):
+ """Raised when the Naukri scraper encounters an unrecoverable error."""
+
+ def __init__(self, message: str | None = None):
+ super().__init__(message or "An error occurred with Naukri")
diff --git a/jobscraper/model.py b/jobscraper/model.py
index 7c0575d..1b25eed 100644
--- a/jobscraper/model.py
+++ b/jobscraper/model.py
@@ -20,11 +20,12 @@ class Site(str, Enum):
INDEED = "indeed"
GLASSDOOR = "glassdoor"
LINKEDIN = "linkedin"
- # NAUKRI = "naukri" # planned
+ UPWORK = "upwork"
+ INTERNSHALA_JOBS = "internshala_jobs"
+ INTERNSHALA_INTERNSHIPS = "internshala_internships"
+ NAUKRI = "naukri"
# FOUNDIT = "foundit" # planned
# SHINE = "shine" # planned
- # INTERNSHALA = "internshala" # planned
- # UPWORK = "upwork" # planned
# APNA = "apna" # planned
diff --git a/jobscraper/naukri/__init__.py b/jobscraper/naukri/__init__.py
new file mode 100644
index 0000000..1d51e08
--- /dev/null
+++ b/jobscraper/naukri/__init__.py
@@ -0,0 +1,7 @@
+"""Naukri job scraper module."""
+
+from __future__ import annotations
+
+from jobscraper.naukri._scraper import NaukriScraper
+
+__all__ = ["NaukriScraper"]
diff --git a/jobscraper/naukri/_scraper.py b/jobscraper/naukri/_scraper.py
new file mode 100644
index 0000000..4d4f11a
--- /dev/null
+++ b/jobscraper/naukri/_scraper.py
@@ -0,0 +1,148 @@
+"""Naukri scraper implementation using headless browser."""
+
+from __future__ import annotations
+
+import random
+import time
+from datetime import date
+from typing import Any
+
+from jobscraper.exception import NaukriException
+from jobscraper.model import JobPost, JobResponse, Scraper, ScraperInput, Site
+from jobscraper.naukri.constant import NAUKRI_HEADERS, JOB_TYPE_MAP, SEARCH_URL
+from jobscraper.naukri.util import (
+ parse_compensation,
+ parse_location,
+ parse_search_html,
+)
+from jobscraper.util import create_logger, get_enum_from_job_type, markdown_converter
+
+logger = create_logger("naukri")
+
+
+class NaukriScraper(Scraper):
+ """Scraper for naukri.com using headless browser (Playwright).
+
+ Uses browser automation to render CSR-based page and extract job listings.
+ """
+
+ def scrape(self, scraper_input: ScraperInput) -> JobResponse:
+ """Fetch job listings from Naukri.
+
+ Args:
+ scraper_input: Validated scraper config.
+
+ Returns:
+ JobResponse with collected JobPost objects.
+
+ Raises:
+ NaukriException: If browser launch fails or critical error occurs.
+ """
+ try:
+ from playwright.sync_api import sync_playwright
+ except ImportError:
+ raise NaukriException(
+ "Playwright required for Naukri scraper. "
+ "Install: pip install playwright && playwright install"
+ )
+
+ jobs: list[JobPost] = []
+ page = None
+ browser = None
+
+ try:
+ with sync_playwright() as p:
+ browser = p.chromium.launch(headless=True)
+ # Bypass headless detection
+ page = browser.new_page(
+ viewport={"width": 1920, "height": 1080},
+ user_agent=(
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/120.0.0.0 Safari/537.36"
+ ),
+ )
+
+ # Add headers to bypass detection
+ page.set_extra_http_headers({
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8",
+ "Accept-Language": "en-US,en;q=0.9",
+ "Sec-Fetch-Dest": "document",
+ "Sec-Fetch-Mode": "navigate",
+ "Sec-Fetch-Site": "none",
+ "Sec-Fetch-User": "?1",
+ })
+
+ page_no = 1
+ while len(jobs) < scraper_input.results_wanted:
+ try:
+ url = self._build_url(
+ scraper_input.search_term,
+ scraper_input.location,
+ page_no,
+ )
+ logger.info(f"Fetching page {page_no}: {url}")
+ page.goto(url, wait_until="networkidle", timeout=30000)
+ time.sleep(random.uniform(0.5, 1.5))
+
+ html = page.content()
+ job_cards = parse_search_html(html)
+
+ if not job_cards:
+ logger.info("No more jobs found")
+ break
+
+ for card in job_cards:
+ if len(jobs) >= scraper_input.results_wanted:
+ break
+
+ try:
+ job = self._card_to_jobpost(card)
+ jobs.append(job)
+ except Exception as e:
+ logger.warning(f"Failed to parse card {card.get('id')}: {e}")
+ continue
+
+ page_no += 1
+ time.sleep(random.uniform(0.5, 1.5))
+
+ except Exception as e:
+ logger.error(f"Error on page {page_no}: {e}")
+ break
+
+ except NaukriException:
+ raise
+ except Exception as e:
+ raise NaukriException(f"Browser error: {e}")
+ finally:
+ if page:
+ page.close()
+ if browser:
+ browser.close()
+
+ return JobResponse(jobs=jobs)
+
+ def _build_url(self, search_term: str, location: str | None, page: int) -> str:
+ """Build Naukri search URL."""
+ url = SEARCH_URL
+ params = [f"keyword={search_term}"]
+ if location:
+ params.append(f"location={location}")
+ params.append(f"pageNo={page}")
+ return f"{url}?{'&'.join(params)}"
+
+ def _card_to_jobpost(self, card: dict[str, Any]) -> JobPost:
+ """Convert job card dict to JobPost."""
+ return JobPost(
+ id=str(card.get("id", "")),
+ site=Site.NAUKRI,
+ job_url=card.get("job_url") or f"https://www.naukri.com/jobs/{card.get('id')}",
+ title=card.get("title") or "Unknown",
+ company=card.get("company"),
+ location=parse_location(card.get("location")),
+ compensation=parse_compensation(card.get("salary")),
+ job_type=[JOB_TYPE_MAP.get(card.get("job_type"), None)],
+ is_remote=None,
+ job_level=card.get("experience"),
+ description=None,
+ )
diff --git a/jobscraper/naukri/constant.py b/jobscraper/naukri/constant.py
new file mode 100644
index 0000000..023b1f7
--- /dev/null
+++ b/jobscraper/naukri/constant.py
@@ -0,0 +1,29 @@
+"""Constants for the Naukri scraper."""
+
+from __future__ import annotations
+
+from jobscraper.model import JobType
+
+BASE_URL = "https://www.naukri.com"
+SEARCH_URL = BASE_URL + "/search-results"
+
+NAUKRI_HEADERS: dict[str, str] = {
+ "User-Agent": (
+ "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+ "AppleWebKit/537.36 (KHTML, like Gecko) "
+ "Chrome/120.0.0.0 Safari/537.36"
+ ),
+ "Accept": (
+ "text/html,application/xhtml+xml,application/xml;"
+ "q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8"
+ ),
+ "Accept-Language": "en-US,en;q=0.9",
+ "Referer": BASE_URL + "/",
+}
+
+JOB_TYPE_MAP: dict[str, JobType] = {
+ "Full Time": JobType.FULL_TIME,
+ "Part Time": JobType.PART_TIME,
+ "Contract": JobType.CONTRACT,
+ "Temporary": JobType.TEMPORARY,
+}
diff --git a/jobscraper/naukri/util.py b/jobscraper/naukri/util.py
new file mode 100644
index 0000000..bedd192
--- /dev/null
+++ b/jobscraper/naukri/util.py
@@ -0,0 +1,146 @@
+"""Utility functions for the Naukri scraper."""
+
+from __future__ import annotations
+
+import re
+from bs4 import BeautifulSoup
+
+from jobscraper.model import Compensation, CompensationInterval, Location
+
+
+def parse_location(raw: str | None) -> Location:
+ """Parse Naukri location string to Location model.
+
+ All results India-based.
+ Handles: 'Bangalore', 'Bangalore, Karnataka', 'Remote', 'Work from Home'.
+ """
+ if not raw or not raw.strip():
+ return Location(country="India")
+
+ raw = raw.strip()
+
+ if raw.lower() in ("remote", "work from home", "pan india", "wfh"):
+ return Location(country="India")
+
+ parts = [p.strip() for p in raw.split(",")]
+ if len(parts) == 1:
+ return Location(city=parts[0], country="India")
+ elif len(parts) == 2:
+ return Location(city=parts[0], state=parts[1], country="India")
+ else:
+ return Location(city=parts[0], country="India")
+
+
+def parse_compensation(raw: str | None) -> Compensation | None:
+ """Parse compensation string from Naukri card.
+
+ Handles:
+ - '₹4 - 7 LPA' → yearly INR (multiply by 100,000)
+ - '₹15,000 - ₹20,000 /month' → monthly INR
+ - '₹500 /day' → daily INR
+ - 'Not disclosed' → None
+ """
+ if not raw or not raw.strip():
+ return None
+
+ raw = raw.strip()
+
+ if any(phrase in raw.lower() for phrase in ["not disclosed", "confidential", "as per"]):
+ return None
+
+ numbers = re.findall(r"[\d,]+(?:\.\d+)?", raw)
+ if not numbers:
+ return None
+
+ try:
+ values = [float(n.replace(",", "")) for n in numbers]
+ except ValueError:
+ return None
+
+ min_val = None
+ max_val = None
+ interval = CompensationInterval.YEARLY
+
+ if len(values) >= 2:
+ min_val, max_val = values[0], values[1]
+ elif len(values) == 1:
+ min_val = values[0]
+
+ if "/month" in raw.lower() or "per month" in raw.lower():
+ interval = CompensationInterval.MONTHLY
+ elif "/day" in raw.lower() or "per day" in raw.lower():
+ interval = CompensationInterval.DAILY
+ elif "/hour" in raw.lower() or "per hour" in raw.lower():
+ interval = CompensationInterval.HOURLY
+ elif "LPA" in raw.upper():
+ interval = CompensationInterval.YEARLY
+ if min_val:
+ min_val *= 100_000
+ if max_val:
+ max_val *= 100_000
+ else:
+ if min_val and min_val >= 100_000:
+ interval = CompensationInterval.YEARLY
+ else:
+ interval = CompensationInterval.MONTHLY
+
+ return Compensation(
+ min_amount=min_val,
+ max_amount=max_val,
+ interval=interval,
+ currency="INR",
+ )
+
+
+def parse_search_html(html: str) -> list[dict]:
+ """Extract job postings from Naukri search results page."""
+ soup = BeautifulSoup(html, "lxml")
+ jobs = []
+
+ for job_card in soup.find_all("article", class_="jobTuple"):
+ try:
+ job_id = job_card.get("data-jobid") or job_card.get("id")
+ if not job_id:
+ continue
+
+ title_elem = job_card.find("a", class_="jobTitle")
+ title = title_elem.get_text(strip=True) if title_elem else None
+
+ company_elem = job_card.find("a", class_="companyName")
+ company = company_elem.get_text(strip=True) if company_elem else None
+
+ loc_elem = job_card.find("span", class_="locWc")
+ location = loc_elem.get_text(strip=True) if loc_elem else None
+
+ exp_elem = job_card.find("span", class_="exp")
+ experience = exp_elem.get_text(strip=True) if exp_elem else None
+
+ salary_elem = job_card.find("span", class_="sal")
+ salary = salary_elem.get_text(strip=True) if salary_elem else None
+
+ job_type_elem = job_card.find("span", class_="jobType")
+ job_type = job_type_elem.get_text(strip=True) if job_type_elem else None
+
+ job_url = None
+ link_elem = job_card.find("a", {"href": True})
+ if link_elem:
+ href = link_elem.get("href", "")
+ if href and not href.startswith("http"):
+ job_url = "https://www.naukri.com" + href
+ else:
+ job_url = href
+
+ jobs.append({
+ "id": job_id,
+ "title": title,
+ "company": company,
+ "location": location,
+ "experience": experience,
+ "salary": salary,
+ "job_type": job_type,
+ "job_url": job_url,
+ })
+ except Exception:
+ continue
+
+ return jobs
diff --git a/pyproject.toml b/pyproject.toml
index 0dc3b22..4f81a5f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -31,9 +31,13 @@ dependencies = [
]
[project.optional-dependencies]
+browser = [
+ "playwright>=1.40.0",
+]
test = [
"pytest>=7.4.0",
"pytest-cov>=4.1.0",
+ "playwright>=1.40.0",
]
lint = [
"black>=24.1.0",
@@ -48,6 +52,7 @@ dev = [
"pre-commit>=3.5.0",
"pytest>=7.4.0",
"pytest-cov>=4.1.0",
+ "playwright>=1.40.0",
]
[build-system]
diff --git a/tests/integration/test_naukri_integration.py b/tests/integration/test_naukri_integration.py
new file mode 100644
index 0000000..089bd71
--- /dev/null
+++ b/tests/integration/test_naukri_integration.py
@@ -0,0 +1,41 @@
+"""Integration tests for Naukri scraper (live site).
+
+Note: Naukri.com uses Akamai WAF which blocks headless browser automation.
+These tests are expected to fail unless run from a residential IP or with proxy.
+"""
+
+import pytest
+from jobscraper.model import ScraperInput, Site
+from jobscraper.naukri import NaukriScraper
+
+
+@pytest.mark.integration
+@pytest.mark.xfail(strict=False, reason="Naukri WAF blocks headless browser (Akamai)")
+def test_naukri_scraper_live_search():
+ """Test live Naukri search."""
+ scraper = NaukriScraper()
+ input_data = ScraperInput(
+ site_name=[Site.NAUKRI],
+ search_term="python",
+ location="Bangalore",
+ results_wanted=5,
+ )
+ response = scraper.scrape(input_data)
+ assert response.jobs is not None
+ # This assertion will fail due to WAF
+ assert len(response.jobs) > 0
+
+
+@pytest.mark.integration
+@pytest.mark.xfail(strict=False, reason="Naukri WAF blocks headless browser")
+def test_naukri_scraper_remote_jobs():
+ """Test Naukri remote job search."""
+ scraper = NaukriScraper()
+ input_data = ScraperInput(
+ site_name=[Site.NAUKRI],
+ search_term="data scientist",
+ location="Remote",
+ results_wanted=5,
+ )
+ response = scraper.scrape(input_data)
+ assert response.jobs is not None
diff --git a/tests/test_naukri.py b/tests/test_naukri.py
new file mode 100644
index 0000000..8ef1448
--- /dev/null
+++ b/tests/test_naukri.py
@@ -0,0 +1,103 @@
+"""Unit tests for Naukri scraper parsing utilities."""
+
+import pytest
+from jobscraper.naukri.util import parse_location, parse_compensation, parse_search_html
+from jobscraper.model import CompensationInterval
+
+
+class TestParseLocation:
+ """Test location parsing."""
+
+ def test_city_only(self):
+ result = parse_location("Bangalore")
+ assert result.city == "Bangalore"
+ assert result.country == "India"
+
+ def test_city_state(self):
+ result = parse_location("Bangalore, Karnataka")
+ assert result.city == "Bangalore"
+ assert result.state == "Karnataka"
+ assert result.country == "India"
+
+ def test_remote_variations(self):
+ for remote_text in ["Remote", "Work from Home", "WFH", "Pan India"]:
+ result = parse_location(remote_text)
+ assert result.country == "India"
+ assert result.city is None
+
+ def test_empty(self):
+ result = parse_location(None)
+ assert result.country == "India"
+ assert result.city is None
+
+
+class TestParseCompensation:
+ """Test compensation parsing."""
+
+ def test_lpa_range(self):
+ result = parse_compensation("₹4 - 7 LPA")
+ assert result is not None
+ assert result.min_amount == 400_000
+ assert result.max_amount == 700_000
+ assert result.interval == CompensationInterval.YEARLY
+ assert result.currency == "INR"
+
+ def test_monthly_range(self):
+ result = parse_compensation("₹15,000 - ₹20,000 /month")
+ assert result is not None
+ assert result.min_amount == 15_000
+ assert result.max_amount == 20_000
+ assert result.interval == CompensationInterval.MONTHLY
+
+ def test_daily_rate(self):
+ result = parse_compensation("₹500 /day")
+ assert result is not None
+ assert result.min_amount == 500
+ assert result.interval == CompensationInterval.DAILY
+
+ def test_not_disclosed(self):
+ result = parse_compensation("Not disclosed")
+ assert result is None
+
+ def test_empty(self):
+ result = parse_compensation(None)
+ assert result is None
+
+
+class TestParseSearchHTML:
+ """Test HTML parsing with mock Naukri HTML."""
+
+ def test_parse_single_job_card(self):
+ html = """
+
+
+ Python Developer
+ Tech Corp
+ Bangalore
+ 3 - 5 years
+ ₹8 - 12 LPA
+ Full Time
+
+
+ """
+ jobs = parse_search_html(html)
+ assert len(jobs) == 1
+ assert jobs[0]["id"] == "12345"
+ assert jobs[0]["title"] == "Python Developer"
+ assert jobs[0]["company"] == "Tech Corp"
+ assert jobs[0]["location"] == "Bangalore"
+
+ def test_parse_multiple_cards(self):
+ html = """
+
+ Job 1
+ Job 2
+
+ """
+ jobs = parse_search_html(html)
+ assert len(jobs) == 2
+
+ def test_parse_empty_html(self):
+ html = ""
+ jobs = parse_search_html(html)
+ assert len(jobs) == 0