Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions jobscraper/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,11 +36,13 @@
from jobscraper.glassdoor import GlassdoorScraper # noqa: E402
from jobscraper.indeed import IndeedScraper # noqa: E402
from jobscraper.linkedin import LinkedInScraper # noqa: E402
from jobscraper.naukri import NaukriScraper # noqa: E402

# Registry that maps each Site member to the scraper class used for it.
# NOTE(review): only the four sites listed here are registered in this view;
# confirm whether other Site members are expected to be mapped elsewhere.
SCRAPER_MAPPING: dict[Site, type] = {
    Site.INDEED: IndeedScraper,
    Site.GLASSDOOR: GlassdoorScraper,
    Site.LINKEDIN: LinkedInScraper,
    Site.NAUKRI: NaukriScraper,
}


Expand Down
20 changes: 19 additions & 1 deletion jobscraper/exception.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,22 @@ def __init__(self, message: str | None = None):
super().__init__(message or "An error occurred with LinkedIn")


# class NaukriException(Exception): pass # planned
class UpworkException(Exception):
"""Raised when the Upwork scraper encounters an unrecoverable error."""

def __init__(self, message: str | None = None):
super().__init__(message or "An error occurred with Upwork")


class InternshalaException(Exception):
"""Raised when the Internshala scraper encounters an unrecoverable error."""

def __init__(self, message: str | None = None):
super().__init__(message or "An error occurred with Internshala")


class NaukriException(Exception):
"""Raised when the Naukri scraper encounters an unrecoverable error."""

def __init__(self, message: str | None = None):
super().__init__(message or "An error occurred with Naukri")
7 changes: 4 additions & 3 deletions jobscraper/model.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,11 +20,12 @@ class Site(str, Enum):
INDEED = "indeed"
GLASSDOOR = "glassdoor"
LINKEDIN = "linkedin"
# NAUKRI = "naukri" # planned
UPWORK = "upwork"
INTERNSHALA_JOBS = "internshala_jobs"
INTERNSHALA_INTERNSHIPS = "internshala_internships"
NAUKRI = "naukri"
# FOUNDIT = "foundit" # planned
# SHINE = "shine" # planned
# INTERNSHALA = "internshala" # planned
# UPWORK = "upwork" # planned
# APNA = "apna" # planned


Expand Down
7 changes: 7 additions & 0 deletions jobscraper/naukri/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
"""Naukri job scraper module."""

from __future__ import annotations

from jobscraper.naukri._scraper import NaukriScraper

# Names exported by ``from jobscraper.naukri import *``: only the scraper class.
__all__ = ["NaukriScraper"]
148 changes: 148 additions & 0 deletions jobscraper/naukri/_scraper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,148 @@
"""Naukri scraper implementation using headless browser."""

from __future__ import annotations

import random
import time
from datetime import date
from typing import Any

from jobscraper.exception import NaukriException
from jobscraper.model import JobPost, JobResponse, Scraper, ScraperInput, Site
from jobscraper.naukri.constant import NAUKRI_HEADERS, JOB_TYPE_MAP, SEARCH_URL
from jobscraper.naukri.util import (
parse_compensation,
parse_location,
parse_search_html,
)
from jobscraper.util import create_logger, get_enum_from_job_type, markdown_converter

# Module-level logger, obtained from the project's create_logger helper
# under the name "naukri"; used by NaukriScraper for progress and errors.
logger = create_logger("naukri")


class NaukriScraper(Scraper):
    """Scraper for naukri.com using a headless browser (Playwright).

    Naukri search results are rendered client-side, so every results page is
    loaded in headless Chromium and the rendered HTML is handed to
    ``parse_search_html`` to extract the job cards.
    """

    def scrape(self, scraper_input: ScraperInput) -> JobResponse:
        """Fetch job listings from Naukri.

        Args:
            scraper_input: Validated scraper config. ``search_term``,
                ``location`` and ``results_wanted`` are read from it.

        Returns:
            JobResponse with the collected JobPost objects (possibly fewer
            than ``results_wanted`` if the site runs out of results or a
            page fails to load).

        Raises:
            NaukriException: If Playwright is not installed, or if launching
                or driving the browser fails.
        """
        try:
            from playwright.sync_api import sync_playwright
        except ImportError as exc:
            raise NaukriException(
                "Playwright required for Naukri scraper. "
                "Install: pip install playwright && playwright install"
            ) from exc

        jobs: list[JobPost] = []

        try:
            with sync_playwright() as p:
                browser = p.chromium.launch(headless=True)
                # The browser must be closed while the Playwright driver is
                # still running, i.e. inside the ``with`` block; closing it
                # after the context manager has exited can itself raise.
                try:
                    # A desktop viewport and user agent make the headless
                    # session look like a regular Chrome browser.
                    page = browser.new_page(
                        viewport={"width": 1920, "height": 1080},
                        user_agent=NAUKRI_HEADERS["User-Agent"],
                    )
                    # Extra navigation headers a real browser would send.
                    page.set_extra_http_headers(
                        {
                            "Accept": NAUKRI_HEADERS["Accept"],
                            "Accept-Language": NAUKRI_HEADERS["Accept-Language"],
                            "Sec-Fetch-Dest": "document",
                            "Sec-Fetch-Mode": "navigate",
                            "Sec-Fetch-Site": "none",
                            "Sec-Fetch-User": "?1",
                        }
                    )
                    self._collect_jobs(page, scraper_input, jobs)
                finally:
                    # Closing the browser also closes every page it owns.
                    browser.close()
        except NaukriException:
            raise
        except Exception as exc:
            raise NaukriException(f"Browser error: {exc}") from exc

        return JobResponse(jobs=jobs)

    def _collect_jobs(
        self,
        page: Any,
        scraper_input: ScraperInput,
        jobs: list[JobPost],
    ) -> None:
        """Walk the result pages and append parsed posts to *jobs* in place.

        Stops when ``results_wanted`` posts were collected, a page yields no
        cards (or only cards already seen), or loading a page fails. Page
        failures are logged rather than raised so that the posts gathered so
        far are still returned.
        """
        seen_ids: set[Any] = set()
        page_no = 1

        while len(jobs) < scraper_input.results_wanted:
            try:
                url = self._build_url(
                    scraper_input.search_term,
                    scraper_input.location,
                    page_no,
                )
                logger.info(f"Fetching page {page_no}: {url}")
                page.goto(url, wait_until="networkidle", timeout=30000)
                # Short random pause so requests are not fired back-to-back.
                time.sleep(random.uniform(0.5, 1.5))
                job_cards = parse_search_html(page.content())
            except Exception as e:
                logger.error(f"Error on page {page_no}: {e}")
                break

            if not job_cards:
                logger.info("No more jobs found")
                break

            new_on_page = 0
            for card in job_cards:
                if len(jobs) >= scraper_input.results_wanted:
                    break

                card_id = card.get("id")
                if card_id in seen_ids:
                    continue
                seen_ids.add(card_id)
                new_on_page += 1

                try:
                    jobs.append(self._card_to_jobpost(card))
                except Exception as e:
                    logger.warning(f"Failed to parse card {card_id}: {e}")

            # A page made only of already-seen cards means pagination has
            # wrapped or stalled; stop instead of requesting pages forever.
            if new_on_page == 0:
                logger.info(f"No new jobs found on page {page_no}; stopping")
                break

            page_no += 1
            time.sleep(random.uniform(0.5, 1.5))

    def _build_url(self, search_term: str, location: str | None, page: int) -> str:
        """Build the Naukri search URL for one results page.

        The query string is percent-encoded, so search terms or locations
        containing spaces, ``&``, ``#`` or non-ASCII text stay intact.
        """
        from urllib.parse import urlencode

        params: dict[str, Any] = {"keyword": search_term or ""}
        if location:
            params["location"] = location
        params["pageNo"] = page
        return f"{SEARCH_URL}?{urlencode(params)}"

    def _card_to_jobpost(self, card: dict[str, Any]) -> JobPost:
        """Convert one job card dict (as built by ``parse_search_html``) to a JobPost."""
        job_type = JOB_TYPE_MAP.get(card.get("job_type"))
        return JobPost(
            id=str(card.get("id", "")),
            site=Site.NAUKRI,
            job_url=card.get("job_url") or f"https://www.naukri.com/jobs/{card.get('id')}",
            title=card.get("title") or "Unknown",
            company=card.get("company"),
            location=parse_location(card.get("location")),
            compensation=parse_compensation(card.get("salary")),
            # An unmapped or missing label must not become ``[None]``.
            job_type=[job_type] if job_type is not None else None,
            is_remote=None,
            job_level=card.get("experience"),
            description=None,
        )
29 changes: 29 additions & 0 deletions jobscraper/naukri/constant.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""Constants for the Naukri scraper."""

from __future__ import annotations

from jobscraper.model import JobType

# Site root, and the search-results endpoint derived from it.
BASE_URL = "https://www.naukri.com"
SEARCH_URL = f"{BASE_URL}/search-results"

# Browser-like request headers (desktop Chrome on Windows) for Naukri requests.
NAUKRI_HEADERS: dict[str, str] = {
    "User-Agent": " ".join(
        [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64)",
            "AppleWebKit/537.36 (KHTML, like Gecko)",
            "Chrome/120.0.0.0 Safari/537.36",
        ]
    ),
    "Accept": ",".join(
        [
            "text/html",
            "application/xhtml+xml",
            "application/xml;q=0.9",
            "image/avif",
            "image/webp",
            "image/apng",
            "*/*;q=0.8",
        ]
    ),
    "Accept-Language": "en-US,en;q=0.9",
    "Referer": f"{BASE_URL}/",
}

# Lookup from a job-type label string to the matching JobType member.
# Keys are matched exactly (case- and space-sensitive); labels that are not
# listed here have no entry, so callers must handle a missing key.
JOB_TYPE_MAP: dict[str, JobType] = {
    "Full Time": JobType.FULL_TIME,
    "Part Time": JobType.PART_TIME,
    "Contract": JobType.CONTRACT,
    "Temporary": JobType.TEMPORARY,
}
146 changes: 146 additions & 0 deletions jobscraper/naukri/util.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,146 @@
"""Utility functions for the Naukri scraper."""

from __future__ import annotations

import re
from bs4 import BeautifulSoup

from jobscraper.model import Compensation, CompensationInterval, Location


def parse_location(raw: str | None) -> Location:
    """Turn a Naukri location label into a ``Location`` whose country is India.

    Behaviour by input shape:
      - missing / blank text, or one of the generic labels 'remote',
        'work from home', 'pan india', 'wfh' (any case): country only.
      - exactly two comma-separated parts ('Bangalore, Karnataka'):
        city and state.
      - one part, or three and more parts: only the first part, as the city.
    """
    cleaned = raw.strip() if raw else ""
    if not cleaned:
        return Location(country="India")

    generic_labels = {"remote", "work from home", "pan india", "wfh"}
    if cleaned.lower() in generic_labels:
        return Location(country="India")

    pieces = [piece.strip() for piece in cleaned.split(",")]
    if len(pieces) == 2:
        city, state = pieces
        return Location(city=city, state=state, country="India")

    # One part, or an ambiguous list of three or more: keep the first as city.
    return Location(city=pieces[0], country="India")


def parse_compensation(raw: str | None) -> Compensation | None:
    """Parse a compensation string from a Naukri card.

    Handles:
    - '₹4 - 7 LPA', '4-7 Lacs PA', '5 Lakh' → yearly INR, amounts × 100,000
    - '₹15,000 - ₹20,000 /month' → monthly INR
    - '₹500 /day' → daily INR
    - '₹200 per hour' → hourly INR
    - 'Not disclosed', 'Confidential', 'As per ...' → None

    The first number found is the minimum and the second (if any) the
    maximum. Without an explicit period or lakh unit, a minimum of at least
    100,000 is treated as yearly and anything smaller as monthly.

    Args:
        raw: Salary text as shown on the card, or None.

    Returns:
        A Compensation in INR, or None when the text is empty, explicitly
        undisclosed, or contains no number.
    """
    if not raw or not raw.strip():
        return None

    text = raw.strip().lower()

    if any(phrase in text for phrase in ("not disclosed", "confidential", "as per")):
        return None

    # Every match must start with a digit. Otherwise a stray comma in the
    # text ("20,000 /month, negotiable") is captured as a "number" and makes
    # the float conversion fail for the whole string.
    numbers = re.findall(r"\d[\d,]*(?:\.\d+)?", text)
    if not numbers:
        return None

    values = [float(number.replace(",", "")) for number in numbers]

    # Amounts quoted in lakhs (LPA / Lac / Lacs / Lakh / Lakhs) are converted
    # to rupees: 1 lakh = 100,000.
    in_lakhs = (
        "lpa" in text
        or re.search(r"(?<![a-z])la(?:c|kh)s?(?![a-z])", text) is not None
    )
    factor = 100_000 if in_lakhs else 1

    min_val = values[0] * factor
    max_val = values[1] * factor if len(values) >= 2 else None

    # An explicit period marker wins over the lakh unit and the size heuristic.
    if "/month" in text or "per month" in text:
        interval = CompensationInterval.MONTHLY
    elif "/day" in text or "per day" in text:
        interval = CompensationInterval.DAILY
    elif "/hour" in text or "per hour" in text:
        interval = CompensationInterval.HOURLY
    elif in_lakhs or min_val >= 100_000:
        interval = CompensationInterval.YEARLY
    else:
        interval = CompensationInterval.MONTHLY

    return Compensation(
        min_amount=min_val,
        max_amount=max_val,
        interval=interval,
        currency="INR",
    )


def _first_text(card, tag_name: str, css_class: str) -> str | None:
    """Return the stripped text of the first matching descendant of *card*, or None."""
    node = card.find(tag_name, class_=css_class)
    return node.get_text(strip=True) if node else None


def parse_search_html(html: str) -> list[dict]:
    """Extract job postings from a Naukri search results page.

    Every ``<article class="jobTuple">`` element becomes one dict with the
    keys ``id``, ``title``, ``company``, ``location``, ``experience``,
    ``salary``, ``job_type`` and ``job_url``. Fields whose element is missing
    are None. Cards without an id are skipped.

    Args:
        html: Rendered HTML of a search results page.

    Returns:
        One dict per parsed card, in document order; empty if none matched.
    """
    import logging
    from urllib.parse import urljoin

    soup = BeautifulSoup(html, "lxml")
    jobs = []

    for job_card in soup.find_all("article", class_="jobTuple"):
        try:
            job_id = job_card.get("data-jobid") or job_card.get("id")
            if not job_id:
                continue

            title_elem = job_card.find("a", class_="jobTitle")
            title = title_elem.get_text(strip=True) if title_elem else None

            # Prefer the title link as the job URL. The first anchor in the
            # card is only a fallback, since it may point somewhere else.
            if title_elem is not None and title_elem.get("href"):
                link_elem = title_elem
            else:
                link_elem = job_card.find("a", {"href": True})
            href = link_elem.get("href") if link_elem else None
            # urljoin leaves absolute URLs untouched and resolves relative
            # and protocol-relative ("//host/...") ones against the site root.
            job_url = urljoin("https://www.naukri.com/", href) if href else None

            jobs.append({
                "id": job_id,
                "title": title,
                "company": _first_text(job_card, "a", "companyName"),
                "location": _first_text(job_card, "span", "locWc"),
                "experience": _first_text(job_card, "span", "exp"),
                "salary": _first_text(job_card, "span", "sal"),
                "job_type": _first_text(job_card, "span", "jobType"),
                "job_url": job_url,
            })
        except Exception:
            # Best effort: one malformed card must not discard the rest of
            # the page, but the failure is recorded for debugging.
            logging.getLogger(__name__).debug(
                "Skipping unparseable Naukri job card", exc_info=True
            )
            continue

    return jobs
Loading