diff --git a/DocSummarization/.gitignore b/DocSummarization/.gitignore new file mode 100644 index 0000000000..409de7af9c --- /dev/null +++ b/DocSummarization/.gitignore @@ -0,0 +1,58 @@ +# Environment variables and secrets +.env +.env.local +.env.*.local +*.env + +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +venv/ +ENV/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# IDEs +.vscode/ +.idea/ +*.swp +*.swo +*~ +.DS_Store + +# Logs +*.log +logs/ + +# Testing +.pytest_cache/ +.coverage +htmlcov/ + +# Node modules +node_modules/ + +# Docker +*.pid + +# Temporary files +*.tmp +*.bak +nul diff --git a/DocSummarization/README.MD b/DocSummarization/README.MD new file mode 100644 index 0000000000..f9486ddcbf --- /dev/null +++ b/DocSummarization/README.MD @@ -0,0 +1,262 @@ +## Document Summarization + +A full-stack document summarization application that processes text and document files to generate concise summaries. +The system integrates a FastAPI backend with enterprise inference endpoints, alongside a modern React + Vite + Tailwind CSS frontend for an intuitive user experience. + +## Table of Contents + +- [Project Overview](#project-overview) +- [Features](#features) +- [Architecture](#architecture) +- [Prerequisites](#prerequisites) +- [Quick Start Deployment](#quick-start-deployment) +- [User Interface](#user-interface) +- [Troubleshooting](#troubleshooting) + +--- + +## Project Overview + +The **Document Summarization** application processes multiple content formats to generate concise summaries. Users can paste text or upload documents (PDF, DOCX). The backend uses enterprise inference endpoints via Keycloak authentication for all text summarization. 
+ +--- + +## Features + +**Backend** + +- Multiple input format support (text, PDF, DOCX) +- PDF text extraction with OCR support for image-based PDFs +- DOCX document processing +- Enterprise inference endpoints for text summarization via Keycloak +- File validation and size limits (PDF/DOCX: 50 MB) +- CORS enabled for web integration +- Comprehensive error handling and logging +- Health check endpoints +- Modular architecture (routes + services) + +**Frontend** + +- Clean, intuitive interface with tab-based input selection +- Drag-and-drop file upload +- Real-time summary display +- Mobile-responsive design with Tailwind CSS +- Built with Vite for fast development + +--- + +## Architecture + +Below is the architecture showing how user input is processed through document extraction, then summarized using the enterprise inference endpoint. + +```mermaid +graph TB + A[React Web UI
Port 5173] -->|User Input| B[FastAPI Backend
Port 8000] + + B --> C{Input Type} + C -->|Text| D[LLM Service] + C -->|PDF/DOCX| E[PDF Service] + + E -->|Extracted Text| D + + D -->|Get Token| F[API Client] + F -->|Authenticate| G[Keycloak] + G -->|Access Token| F + F -->|Token| D + D -->|API Call with Token| H[Enterprise Inference
Llama-3.1-8B-Instruct] + H -->|Summary| B + B -->|JSON Response| A + + style A fill:#e1f5ff + style B fill:#fff4e1 + style D fill:#ffe1f5 + style E fill:#ffe1f5 + style F fill:#fffacd + style G fill:#ffcccc + style H fill:#e1ffe1 +``` + +**Service Components:** + +1. **React Web UI (Port 5173)** - Provides intuitive interface with drag-and-drop file upload, tab-based input selection, and real-time summary display + +2. **FastAPI Backend (Port 8000)** - Orchestrates document processing, handles authentication, and routes requests to appropriate processing services + +**Typical Flow:** + +1. User inputs text or uploads a document (PDF/DOCX) through the web UI. +2. The backend processes the input: + - Text: Sent directly to LLM service + - PDF/DOCX: Extracted using PDF service with OCR support +3. The LLM service requests a token from API Client. +4. API Client authenticates with Keycloak and obtains an access token. +5. LLM service uses the token to call the enterprise inference endpoint. +6. The model generates a summary using Llama-3.1-8B-Instruct. +7. The summary is returned and displayed to the user via the UI. + +--- + +## Prerequisites + +### System Requirements + +Before you begin, ensure you have the following installed: + +- **Docker and Docker Compose** +- **Enterprise inference endpoint access** (Keycloak authentication) + +### Verify Docker Installation + +```bash +# Check Docker version +docker --version + +# Check Docker Compose version +docker compose version + +# Verify Docker is running +docker ps +``` + +--- + +## Quick Start Deployment + +### Clone the Repository + +```bash +git clone https://github.com/opea-project/GenAIExamples.git +cd GenAIExamples/DocSummarization +``` + +### Set up the Environment + +This application requires an `.env` file in the `backend` directory for proper configuration. 
Create it with the commands below: + +```bash +# Create the .env file in the backend directory +mkdir -p backend +cat > backend/.env << EOF +# Enterprise/Keycloak Configuration (REQUIRED) +BASE_URL=https://api.example.com +KEYCLOAK_REALM=master +KEYCLOAK_CLIENT_ID=api +KEYCLOAK_CLIENT_SECRET=your_client_secret + +# Model Configuration (Enterprise Inference) +INFERENCE_MODEL_ENDPOINT=Llama-3.1-8B-Instruct +INFERENCE_MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct + +# LLM Configuration +LLM_TEMPERATURE=0.7 +LLM_MAX_TOKENS=2000 + +# Service Configuration +SERVICE_PORT=8000 +LOG_LEVEL=INFO + +# CORS Settings +CORS_ORIGINS=* +EOF +``` + +Or manually create `backend/.env` with: + +```bash +# Enterprise/Keycloak Configuration (REQUIRED) +BASE_URL=https://api.example.com +KEYCLOAK_REALM=master +KEYCLOAK_CLIENT_ID=api +KEYCLOAK_CLIENT_SECRET=your_client_secret + +# Model Configuration (Enterprise Inference) +INFERENCE_MODEL_ENDPOINT=Llama-3.1-8B-Instruct +INFERENCE_MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct + +# LLM Configuration +LLM_TEMPERATURE=0.7 +LLM_MAX_TOKENS=2000 + +# Service Configuration +SERVICE_PORT=8000 +LOG_LEVEL=INFO + +# CORS Settings +CORS_ORIGINS=* +``` + + +### Running the Application + +Start both API and UI services together with Docker Compose: + +```bash +# From the DocSummarization directory +docker compose up --build + +# Or run in detached mode (background) +docker compose up -d --build +``` + +The Backend will be available at: `http://localhost:8000` +The UI will be available at: `http://localhost:5173` + +**View logs**: + +```bash +# All services +docker compose logs -f + +# Backend only +docker compose logs -f backend + +# Frontend only +docker compose logs -f frontend +``` + +**Verify the services are running**: + +```bash +# Check API health +curl http://localhost:8000/health + +# Check if containers are running +docker compose ps +``` + +--- + + +## User Interface + +**Using the Application** + +Make sure you are at the localhost:5173 url + 
+You will be directed to the main page which has each feature + +![Home Page - Hero Section](./assets/img/homepage.png) + +![Home Page - Hero Section](./assets/img/ui.png) + + + +**UI Configuration** + +When running with Docker Compose, the UI automatically connects to the backend API. The frontend is available at `http://localhost:5173` and the API at `http://localhost:8000`. + + +### Stopping the Application + +```bash +docker compose down +``` + +--- + +## Troubleshooting + +For comprehensive troubleshooting guidance, common issues, and solutions, refer to: + +[TROUBLESHOOTING.md](./TROUBLESHOOTING.md) + diff --git a/DocSummarization/TROUBLESHOOTING.md b/DocSummarization/TROUBLESHOOTING.md new file mode 100644 index 0000000000..789e18dfe0 --- /dev/null +++ b/DocSummarization/TROUBLESHOOTING.md @@ -0,0 +1,320 @@ +# Troubleshooting Guide + +## Common Issues + +### 1. Containers Not Starting + +**Symptom**: Containers fail to start or exit immediately + +**Check container status:** +```bash +docker compose ps +``` + +**View error logs:** +```bash +docker compose logs backend +docker compose logs frontend +``` + +**Solution:** +```bash +# Rebuild containers +docker compose down +docker compose up -d --build +``` + +### 2. Backend Connection Errors + +**Symptom**: Frontend shows "Failed to connect" or network errors + +**Check backend health:** +```bash +curl http://localhost:8000/health +``` + +**Expected response:** +```json +{"status":"healthy","service":"Document Summarization Service","version":"2.0.0","llm_provider":"Enterprise Inference (Keycloak)"} +``` + +**Solution:** +- Verify backend container is running: `docker compose ps` +- Check backend logs: `docker compose logs backend -f` +- Restart backend: `docker compose restart backend` + +### 3. 
Keycloak Authentication Errors + +**Symptom**: Text/PDF summarization fails with authentication errors + +**Error**: `Authentication error` or `Failed to resolve 'api.example.com'` + +**Solution:** +- Check Keycloak credentials in `backend/.env`: + - `BASE_URL` (enterprise inference endpoint) + - `KEYCLOAK_REALM` + - `KEYCLOAK_CLIENT_ID` + - `KEYCLOAK_CLIENT_SECRET` +- Verify enterprise inference endpoint is accessible +- Test authentication: +```bash +curl -X POST https://your-api.example.com/token \ + -d "grant_type=client_credentials" \ + -d "client_id=your-client-id" \ + -d "client_secret=your-client-secret" +``` + +**Error**: `Connection timeout` + +**Solution:** +- Verify `BASE_URL` is correct in `backend/.env` +- Check network connectivity to enterprise endpoint +- Verify firewall settings allow HTTPS connections + +### 4. PDF Processing Errors + +**Symptom**: PDF upload fails or returns empty text + +**Error**: `Failed to extract text from PDF` + +**Causes:** +- Scanned PDF without text layer (image-only PDF) +- Password-protected PDF +- Corrupted PDF file +- PDF with complex formatting + +**Solution:** +- For scanned PDFs, ensure OCR was run during scanning +- Remove password protection before uploading +- Try re-saving PDF with Adobe Reader or similar tool +- Check backend logs for specific error details +- Maximum PDF size: 50MB + +### 5. Frontend Not Loading + +**Symptom**: Browser shows blank page or cannot connect to localhost:5173 + +**Check frontend status:** +```bash +docker compose ps frontend +``` + +**Check frontend logs:** +```bash +docker compose logs frontend -f +``` + +**Solution:** +- Clear browser cache and hard refresh (Ctrl+F5) +- Verify port 5173 is not in use: `netstat -ano | findstr :5173` (Windows) +- Kill conflicting process if port is occupied +- Restart frontend: `docker compose restart frontend` +- Check firewall settings allow localhost:5173 + +### 6. 
Port Already in Use + +**Error**: `Port 5173 is already allocated` or `Port 8000 is already allocated` + +**Find process using port:** +```bash +# Windows +netstat -ano | findstr :5173 +netstat -ano | findstr :8000 + +# Linux/Mac +lsof -i :5173 +lsof -i :8000 +``` + +**Solution:** +- Stop the conflicting process +- Or change ports in `docker-compose.yml`: + ```yaml + ports: + - "8001:8000" # Change 8000 to 8001 + - "5174:80" # Change 5173 to 5174 + ``` + +### 7. Out of Memory Errors + +**Symptom**: Container crashes or backend becomes unresponsive + +**Check logs:** +```bash +docker compose logs backend | grep -i "memory\|killed" +``` + +**Solution:** +- Reduce file sizes (use smaller PDFs) +- Reduce `max_tokens` in LLM requests +- Increase Docker memory limit in Docker Desktop settings (minimum 4GB recommended) +- Process one file at a time instead of multiple concurrent requests + +### 8. Backend Service Unavailable + +**Symptom**: 502 Bad Gateway or 503 Service Unavailable + +**Check backend:** +```bash +docker compose logs backend --tail=50 +``` + +**Common causes:** +- Backend still starting (wait 30-60 seconds after start) +- Configuration error in `.env` file +- Enterprise inference endpoint unreachable +- Keycloak authentication failing +- Python dependency issues + +**Solution:** +```bash +# Restart backend +docker compose restart backend + +# If issues persist, rebuild +docker compose down +docker compose up -d --build backend +``` + +## Configuration Issues + +### Invalid .env Configuration + +**Symptom**: Backend fails to start with configuration errors + +**Check required variables in `backend/.env`:** + +**For text/PDF/DOCX summarization:** +```bash +BASE_URL=https://api.example.com +KEYCLOAK_REALM=master +KEYCLOAK_CLIENT_ID=api +KEYCLOAK_CLIENT_SECRET=your_client_secret +INFERENCE_MODEL_ENDPOINT=Llama-3.1-8B-Instruct +INFERENCE_MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct +``` + +**Common mistakes:** +- Missing required Keycloak variables +- Extra 
spaces in variable names +- Wrong endpoint format (missing https://) +- Quotes around values (not needed in .env files) +- Using placeholder values like "api.example.com" or "your_client_secret" + +### File Size Limits + +**Maximum file sizes:** +- PDF/DOCX documents: 50 MB + +**Configured in `backend/config.py`:** +```python +MAX_PDF_SIZE = 52428800 # 50MB in bytes +``` + +## Advanced Troubleshooting + +### Enable Debug Logging + +Edit `backend/.env`: +```bash +LOG_LEVEL=DEBUG +``` + +Restart backend: +```bash +docker compose restart backend +docker compose logs backend -f +``` + +### Test Backend Directly + +**Test text summarization:** +```bash +curl -X POST http://localhost:8000/v1/docsum \ + -F "type=text" \ + -F "messages=This is a test document about artificial intelligence and machine learning." \ + -F "max_tokens=100" \ + -F "stream=false" +``` + +**Test PDF summarization:** +```bash +curl -X POST http://localhost:8000/v1/docsum \ + -F "type=text" \ + -F "files=@test.pdf" \ + -F "max_tokens=100" \ + -F "stream=false" +``` + + +### Inspect Container + +**Access backend container shell:** +```bash +docker compose exec backend /bin/bash +``` + +**Check Python environment:** +```bash +docker compose exec backend pip list +docker compose exec backend python -c "import pypdf; print('pypdf installed')" +``` + +**Verify environment variables:** +```bash +docker compose exec backend env | grep -E "BASE_URL|KEYCLOAK" +``` + +### Clean Docker Environment + +If issues persist, clean Docker completely: + +```bash +# Stop and remove containers +docker compose down -v + +# Remove unused images +docker system prune -a + +# Rebuild from scratch +docker compose up -d --build +``` + +## Architecture-Specific Issues + +### Enterprise Inference Connection + +**Symptom**: All summarization fails (text, PDF) + +**Required for**: ALL summarization operations + +**Check configuration:** +1. Verify `BASE_URL` points to your enterprise inference endpoint +2. 
Confirm Keycloak credentials are correct +3. Test Keycloak authentication separately +4. Verify network access to enterprise endpoint +5. Check if model name matches available models + +## Getting Help + +If issues persist after following this guide: + +1. **Collect Information:** + - Docker logs: `docker compose logs > logs.txt` + - Docker status: `docker compose ps` + - Environment check: `docker compose config` + +2. **Check Configuration:** + - Review `backend/.env` file (remove sensitive data before sharing) + - Verify Keycloak credentials with your admin + +3. **Try Minimal Setup:** + - Start with text summarization (simple, no files) + - Then test PDF processing + - This helps isolate which component is failing + +4. **System Information:** + - Docker version: `docker --version` + - Docker Compose version: `docker compose version` + - Operating system and version + - Available memory and disk space diff --git a/DocSummarization/assets/img/docsum-homepage1.png b/DocSummarization/assets/img/docsum-homepage1.png new file mode 100644 index 0000000000..af95b1a30a Binary files /dev/null and b/DocSummarization/assets/img/docsum-homepage1.png differ diff --git a/DocSummarization/assets/img/docsum-homepage2.png b/DocSummarization/assets/img/docsum-homepage2.png new file mode 100644 index 0000000000..f556397dbc Binary files /dev/null and b/DocSummarization/assets/img/docsum-homepage2.png differ diff --git a/DocSummarization/assets/img/homepage.png b/DocSummarization/assets/img/homepage.png new file mode 100644 index 0000000000..b2c8c07c6f Binary files /dev/null and b/DocSummarization/assets/img/homepage.png differ diff --git a/DocSummarization/assets/img/ui.png b/DocSummarization/assets/img/ui.png new file mode 100644 index 0000000000..39da31a1d4 Binary files /dev/null and b/DocSummarization/assets/img/ui.png differ diff --git a/DocSummarization/backend/.env.example b/DocSummarization/backend/.env.example new file mode 100644 index 0000000000..e09ae8a97c --- /dev/null 
+++ b/DocSummarization/backend/.env.example @@ -0,0 +1,25 @@ +# Enterprise/Keycloak Configuration (REQUIRED) +BASE_URL=https://api.example.com +KEYCLOAK_REALM=master +KEYCLOAK_CLIENT_ID=api +KEYCLOAK_CLIENT_SECRET=your_client_secret + +# Model Configuration (Enterprise Inference) +INFERENCE_MODEL_ENDPOINT=Llama-3.1-8B-Instruct +INFERENCE_MODEL_NAME=meta-llama/Llama-3.1-8B-Instruct + +# LLM Configuration +LLM_TEMPERATURE=0.7 +LLM_MAX_TOKENS=2000 + +# Service Configuration +SERVICE_PORT=8000 +LOG_LEVEL=INFO + +# File Upload Limits +MAX_FILE_SIZE=524288000 +MAX_PDF_SIZE=52428800 +MAX_PDF_PAGES=100 + +# CORS Settings +CORS_ORIGINS=* diff --git a/DocSummarization/backend/Dockerfile b/DocSummarization/backend/Dockerfile new file mode 100644 index 0000000000..bb7151c8ba --- /dev/null +++ b/DocSummarization/backend/Dockerfile @@ -0,0 +1,35 @@ +FROM python:3.11-slim + +WORKDIR /app + +# Create a non-root user and group +RUN groupadd --system app && useradd --system --gid app app + +# Install system dependencies +RUN apt-get update && apt-get install -y \ + curl \ + tesseract-ocr \ + tesseract-ocr-eng \ + poppler-utils \ + --no-install-recommends \ + && rm -rf /var/lib/apt/lists/* + +# Copy requirements first for better layer caching +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY --chown=app:app . . 
@router.get("/health", response_model=HealthResponse)
async def health_check():
    """Health check endpoint - dynamically checks service configuration"""
    response = {
        "status": "healthy",
        "service": config.APP_TITLE,
        "version": config.APP_VERSION
    }

    # Only show llm_provider if Keycloak is actually configured
    if config.BASE_URL and config.KEYCLOAK_CLIENT_SECRET:
        response["llm_provider"] = "Enterprise Inference (Keycloak)"

    return response


# Upload extensions accepted by /v1/docsum. The first group is handed to
# pdf_service.extract_text; the second is read directly as UTF-8 text.
_DOCUMENT_EXTENSIONS = (".pdf", ".docx", ".doc")
_TEXT_EXTENSIONS = (".txt",)


def _summary_response(text, max_tokens, stream_bool):
    """Summarize ``text`` and wrap the result for the client.

    Args:
        text: Content passed to ``llm_service.summarize``.
        max_tokens: Forwarded unchanged to ``llm_service.summarize``.
        stream_bool: When true the summarizer result is treated as a chunk
            iterator and returned as a server-sent-event stream.

    Returns:
        A ``StreamingResponse`` (streaming) or a dict with ``text``,
        ``summary``, ``word_count`` and ``char_count`` (non-streaming).
    """
    summary = llm_service.summarize(
        text=text,
        max_tokens=max_tokens,
        stream=stream_bool
    )

    if stream_bool:
        return StreamingResponse(
            _format_stream(summary),
            media_type="text/event-stream"
        )

    return {
        "text": summary,
        "summary": summary,
        "word_count": len(summary.split()),
        "char_count": len(summary)
    }


async def _read_upload_text(upload):
    """Extract the text of an uploaded PDF/DOC/DOCX/TXT file.

    The upload is written to a uniquely named temporary file whose name is
    chosen by ``tempfile.mkstemp``; the client-supplied filename is used only
    to pick the extension, never as part of the path.  The temporary file is
    always deleted before this function returns or raises.

    Args:
        upload: The ``UploadFile`` received by the endpoint.

    Returns:
        ``(text, label)`` where ``label`` is ``"PDF"``, ``"DOCX"`` or
        ``"file"`` and is used in the "No text found in ..." error message.

    Raises:
        HTTPException: 400 for an unsupported extension or a TXT file that is
            not valid UTF-8.
    """
    import tempfile  # local import: this module's import block does not include tempfile

    original_name = upload.filename or ""
    name_lower = original_name.lower()
    extension = next(
        (ext for ext in _DOCUMENT_EXTENSIONS + _TEXT_EXTENSIONS if name_lower.endswith(ext)),
        None
    )
    if extension is None:
        logger.error(f"Unsupported file type: {original_name!r}")
        raise HTTPException(status_code=400, detail="Unsupported file type. Please upload PDF, DOCX, or TXT files.")

    # The suffix is one of the fixed extensions above, so the resulting path
    # cannot escape the temp directory.
    # NOTE(review): assumes pdf_service.extract_text picks its parser from the
    # file extension - confirm against services/pdf_service.
    descriptor, temp_path = tempfile.mkstemp(suffix=extension)
    try:
        with os.fdopen(descriptor, "wb") as buffer:
            buffer.write(await upload.read())

        if extension in _TEXT_EXTENSIONS:
            logger.info("Reading text from TXT file")
            try:
                with open(temp_path, "r", encoding="utf-8") as handle:
                    return handle.read(), "file"
            except UnicodeDecodeError:
                # A badly encoded upload is a client problem, not a server fault.
                raise HTTPException(status_code=400, detail="File is not valid UTF-8 text")

        file_type = "PDF" if extension == ".pdf" else "DOCX"
        logger.info(f"Extracting text from {file_type} file")
        return pdf_service.extract_text(temp_path), file_type
    finally:
        # Single cleanup point for success and every failure path.
        if os.path.exists(temp_path):
            os.remove(temp_path)


@router.post("/v1/docsum")
async def summarize_document(
    type: str = Form(...),
    messages: str = Form(""),
    max_tokens: int = Form(1024),
    language: str = Form("en"),
    summary_type: str = Form("auto"),
    stream: str = Form("false"),
    files: Optional[UploadFile] = File(None)
):
    """
    Summarize pasted text or an uploaded document.

    Pasted text is used when ``type`` is ``"text"`` and ``messages`` is not
    blank; otherwise an uploaded file, if any, is processed regardless of
    ``type``.

    Args:
        type: Input type (text, pdf)
        messages: Text content (for type='text')
        max_tokens: Maximum summary length, forwarded to the LLM service
        language: Language code (accepted for compatibility; not used here)
        summary_type: Type of summary (accepted for compatibility; not used here)
        stream: "true" (case-insensitive) enables a server-sent-event response
        files: Uploaded PDF, DOC, DOCX or TXT file

    Returns:
        Summary response with text, or an SSE stream when streaming

    Raises:
        HTTPException: 400 when neither usable text nor a supported,
            non-empty file is supplied; 500 for any other failure.
    """
    try:
        stream_bool = stream.lower() == "true"

        logger.info(f"Request received - type: {type}, has_file: {files is not None}, messages_len: {len(messages)}")

        # ========== Text Input ==========
        if type == "text" and messages.strip():
            logger.info("Processing text input")
            return _summary_response(messages, max_tokens, stream_bool)

        # ========== File Upload (Documents) ==========
        if files:
            logger.info(f"Processing uploaded file: {files.filename!r}, type={type}")
            text_content, label = await _read_upload_text(files)

            if not text_content.strip():
                raise HTTPException(status_code=400, detail=f"No text found in {label}")

            logger.info(f"Extracted {len(text_content)} characters, generating summary")
            return _summary_response(text_content, max_tokens, stream_bool)

        # ========== Invalid Request ==========
        raise HTTPException(
            status_code=400,
            detail="Either text message or file is required"
        )

    except HTTPException:
        raise
    except Exception as e:
        logger.error(f"Summarization error: {str(e)}")
        raise HTTPException(status_code=500, detail=f"Summarization error: {str(e)}")


def _format_stream(generator):
    """
    Format streaming response for SSE

    Args:
        generator: Text chunk generator

    Yields:
        One ``data: {"text": ...}`` event per chunk followed by
        ``data: [DONE]``; if iteration raises, a single
        ``data: {"error": ...}`` event is emitted instead of ``[DONE]``.
    """
    try:
        for chunk in generator:
            yield f"data: {json.dumps({'text': chunk})}\n\n"
        yield "data: [DONE]\n\n"
    except Exception as e:
        yield f"data: {json.dumps({'error': str(e)})}\n\n"
class APIClient:
    """Keycloak-authenticated factory for OpenAI-style inference clients.

    On construction the client requests an access token from
    ``{BASE_URL}/token`` using the client-credentials grant, provided both
    ``config.BASE_URL`` and ``config.KEYCLOAK_CLIENT_SECRET`` are set.
    """

    # (connect, read) timeout in seconds for the token request; without it a
    # stalled identity provider would block the caller indefinitely.
    AUTH_TIMEOUT_SECONDS = (10, 30)
    # Re-authenticate this many seconds before the reported token expiry.
    TOKEN_REFRESH_MARGIN_SECONDS = 30

    def __init__(self):
        self.base_url = config.BASE_URL
        self.token = None
        self.http_client = None
        # time.monotonic() deadline of the current token, or None if unknown.
        self.token_expires_at = None

        if self.base_url and config.KEYCLOAK_CLIENT_SECRET:
            self._authenticate()

    def _authenticate(self) -> None:
        """Authenticate and obtain access token from Keycloak

        Raises:
            Exception: when the token endpoint does not answer 200, the
                response carries no ``access_token``, or the request fails.
        """
        try:
            token_url = f"{self.base_url}/token"
            logger.info(f"Authenticating with Keycloak at {token_url}")

            payload = {
                "grant_type": "client_credentials",
                "client_id": config.KEYCLOAK_CLIENT_ID,
                "client_secret": config.KEYCLOAK_CLIENT_SECRET,
            }

            # SECURITY(review): verify=False disables TLS certificate checks
            # while the client secret is being transmitted. Kept because
            # existing deployments may rely on it; prefer a CA bundle.
            response = requests.post(
                token_url,
                data=payload,
                verify=False,
                timeout=self.AUTH_TIMEOUT_SECONDS,
            )

            if response.status_code != 200:
                raise Exception(f"Authentication failed: {response.status_code} - {response.text}")

            body = response.json()
            token = body.get("access_token")
            if not token:
                raise Exception("Authentication failed: response did not contain an access_token")

            self.token = token
            self.token_expires_at = self._expiry_deadline(body.get("expires_in"))
            if self.http_client is None:
                # Reused across token refreshes so connections are not leaked.
                self.http_client = httpx.Client(verify=False)
            logger.info("Authentication successful")

        except Exception as e:
            logger.error(f"Authentication error: {str(e)}")
            raise

    @staticmethod
    def _expiry_deadline(expires_in):
        """Convert an OAuth ``expires_in`` value into a monotonic deadline.

        Returns:
            ``time.monotonic()`` plus the lifetime in seconds, or ``None``
            when the value is missing, non-numeric or not positive.
        """
        import time  # local import: this module's import block does not include time

        try:
            lifetime = float(expires_in)
        except (TypeError, ValueError):
            return None
        if lifetime <= 0:
            return None
        return time.monotonic() + lifetime

    def _token_expired(self) -> bool:
        """Return True when the token has a known expiry that is due within the refresh margin."""
        import time  # local import: this module's import block does not include time

        if self.token_expires_at is None:
            return False
        return time.monotonic() >= self.token_expires_at - self.TOKEN_REFRESH_MARGIN_SECONDS

    def get_inference_client(self):
        """Get OpenAI-style client for inference/completions

        The access token is refreshed first when it is known to be expired
        (or about to expire).

        Raises:
            ValueError: when no token or HTTP client is available.
        """
        if self._token_expired():
            self._authenticate()

        if not self.token or not self.http_client:
            raise ValueError("API client not authenticated")

        return OpenAI(
            api_key=self.token,
            base_url=f"{self.base_url}/{config.INFERENCE_MODEL_ENDPOINT}/v1",
            http_client=self.http_client
        )

    def is_authenticated(self) -> bool:
        """Check if client is authenticated"""
        return self.token is not None and self.http_client is not None


# Global instance
api_client = None

def get_api_client():
    """Get or create global API client instance

    The instance is created on first use; if construction raises, nothing is
    cached and the next call tries again.
    """
    global api_client
    if api_client is None:
        api_client = APIClient()
    return api_client
class Settings(BaseSettings):
    """Application Settings - Swap endpoints here for different deployments"""

    # ---- Service identity ----
    SERVICE_NAME: str = "Document Summarization Service"
    SERVICE_VERSION: str = "1.0.0"
    SERVICE_PORT: int = 8000

    # ---- LLM provider ----
    # One of the literal provider names below selects the backend.
    LLM_PROVIDER: Literal["openai", "groq", "ollama", "vllm", "tgi"] = "openai"

    # Base URL, credential and model name for the selected LLM backend.
    LLM_ENDPOINT: str = "https://api.openai.com/v1"
    LLM_API_KEY: Optional[str] = None
    LLM_MODEL: str = "gpt-4o-mini"

    # Generation parameters.
    LLM_TEMPERATURE: float = 0.7
    LLM_MAX_TOKENS: int = 2000
    LLM_MAX_RETRIES: int = 3

    # ---- Audio provider ----
    AUDIO_PROVIDER: Literal["openai", "whisper-local", "faster-whisper"] = "openai"

    # Base URL, credential and model name for the audio backend.
    AUDIO_ENDPOINT: str = "https://api.openai.com/v1"
    AUDIO_API_KEY: Optional[str] = None
    AUDIO_MODEL: str = "whisper-1"

    # ---- PDF processing ----
    MAX_PDF_SIZE_MB: int = 50
    PDF_EXTRACT_IMAGES: bool = False

    # ---- Upload limits and accepted extensions ----
    MAX_FILE_SIZE_MB: int = 100
    ALLOWED_AUDIO_FORMATS: list = [".mp3", ".wav", ".m4a", ".flac", ".ogg"]
    ALLOWED_VIDEO_FORMATS: list = [".mp4", ".avi", ".mov", ".mkv"]
    ALLOWED_PDF_FORMATS: list = [".pdf"]

    # ---- CORS: "*" or a comma-separated origin list ----
    CORS_ORIGINS: str = "*"

    # ---- Logging ----
    LOG_LEVEL: str = "INFO"

    class Config:
        env_file = ".env"
        case_sensitive = True

    # ---- Deployment presets: each returns a new Settings with overrides ----

    @classmethod
    def for_openai(cls):
        """Settings with both LLM and audio pointed at the OpenAI API."""
        openai_url = "https://api.openai.com/v1"
        overrides = {
            "LLM_PROVIDER": "openai",
            "LLM_ENDPOINT": openai_url,
            "AUDIO_PROVIDER": "openai",
            "AUDIO_ENDPOINT": openai_url,
        }
        return cls(**overrides)

    @classmethod
    def for_groq(cls):
        """Settings with the LLM on Groq and audio left on the OpenAI API."""
        overrides = {
            "LLM_PROVIDER": "groq",
            "LLM_ENDPOINT": "https://api.groq.com/openai/v1",
            "AUDIO_PROVIDER": "openai",
            "AUDIO_ENDPOINT": "https://api.openai.com/v1",
        }
        return cls(**overrides)

    @classmethod
    def for_local_ollama(cls):
        """Settings for a local Ollama LLM with a local Whisper audio service."""
        overrides = {
            "LLM_PROVIDER": "ollama",
            "LLM_ENDPOINT": "http://localhost:11434",
            "LLM_MODEL": "llama3.2",
            "AUDIO_PROVIDER": "whisper-local",
            "AUDIO_ENDPOINT": "http://localhost:8000",
        }
        return cls(**overrides)

    @classmethod
    def for_local_vllm(cls):
        """Settings for a local vLLM server with a faster-whisper audio service."""
        overrides = {
            "LLM_PROVIDER": "vllm",
            "LLM_ENDPOINT": "http://localhost:8000/v1",
            "LLM_MODEL": "meta-llama/Llama-3.2-3B-Instruct",
            "AUDIO_PROVIDER": "faster-whisper",
            "AUDIO_ENDPOINT": "http://localhost:8001",
        }
        return cls(**overrides)


# Module-level instance built from defaults and the environment / .env file.
settings = Settings()
+1,32 @@ +""" +Data Models for Doc-Sum API +""" + +from pydantic import BaseModel, Field +from typing import Optional, Literal + + +class SummarizeRequest(BaseModel): + """Request model for summarization""" + type: Literal["text", "pdf"] + messages: Optional[str] = "" + max_tokens: int = Field(default=1024, ge=100, le=4000) + language: str = "en" + summary_type: str = "auto" + stream: bool = False + + +class SummarizeResponse(BaseModel): + """Response model for summarization""" + text: str + summary: str # Kept for backward compatibility + word_count: Optional[int] = None + char_count: Optional[int] = None + + +class HealthResponse(BaseModel): + """Health check response""" + status: str + service: str + version: str + llm_provider: Optional[str] = None diff --git a/DocSummarization/backend/requirements.txt b/DocSummarization/backend/requirements.txt new file mode 100644 index 0000000000..854f811011 --- /dev/null +++ b/DocSummarization/backend/requirements.txt @@ -0,0 +1,23 @@ +# Core Web Framework +fastapi==0.109.0 +uvicorn[standard]==0.27.0 +python-multipart==0.0.6 + +# LLM & AI APIs +httpx==0.27.2 +requests==2.31.0 +openai==1.54.0 # Required for OpenAI-compatible API interface to Enterprise Inference + +# Document Processing +pypdf==6.1.1 +python-docx==1.1.0 +pdf2image==1.16.3 +pytesseract==0.3.10 +Pillow==10.2.0 + +# Configuration Management +python-dotenv==1.0.0 +pydantic==2.5.3 + +# Utilities +aiofiles==23.2.1 \ No newline at end of file diff --git a/DocSummarization/backend/server.py b/DocSummarization/backend/server.py new file mode 100644 index 0000000000..388b39f193 --- /dev/null +++ b/DocSummarization/backend/server.py @@ -0,0 +1,98 @@ +""" +FastAPI server for Doc-Sum Application +""" + +import logging +from fastapi import FastAPI +from fastapi.middleware.cors import CORSMiddleware +import uvicorn + +import config +from models import HealthResponse +from api.routes import router + +# Configure logging +logging.basicConfig( + level=getattr(logging, 
config.LOG_LEVEL), + format='%(asctime)s - %(name)s - %(levelname)s - %(message)s' +) +logger = logging.getLogger(__name__) + +# Initialize FastAPI app +app = FastAPI( + title=config.APP_TITLE, + description=config.APP_DESCRIPTION, + version=config.APP_VERSION +) + +# Add CORS middleware +app.add_middleware( + CORSMiddleware, + allow_origins=config.CORS_ORIGINS.split(",") if config.CORS_ORIGINS != "*" else ["*"], + allow_credentials=config.CORS_ALLOW_CREDENTIALS, + allow_methods=config.CORS_ALLOW_METHODS, + allow_headers=config.CORS_ALLOW_HEADERS, +) + +# Include API routes +app.include_router(router) + +# Root endpoint +@app.get("/") +def root(): + """Root endpoint with service info""" + response = { + "message": "Document Summarization Service is running", + "version": config.APP_VERSION, + "status": "healthy", + "docs": "/docs", + "health": "/health" + } + + # Only show config if services are actually configured + if config.BASE_URL and config.KEYCLOAK_CLIENT_SECRET: + response["config"] = { + "llm_provider": "Enterprise Inference (Keycloak)", + "llm_model": config.INFERENCE_MODEL_NAME + } + + return response + +# Health check endpoint +@app.get("/health", response_model=HealthResponse) +def health_check(): + """Detailed health check - dynamically checks service configuration""" + response_data = { + "status": "healthy", + "service": config.APP_TITLE, + "version": config.APP_VERSION + } + + # Only show llm_provider if Keycloak is actually configured + if config.BASE_URL and config.KEYCLOAK_CLIENT_SECRET: + response_data["llm_provider"] = "Enterprise Inference (Keycloak)" + + return HealthResponse(**response_data) + +# Startup event +@app.on_event("startup") +async def startup_event(): + """Log configuration on startup""" + logger.info("=" * 60) + logger.info(f"Starting {config.APP_TITLE} v{config.APP_VERSION}") + logger.info("=" * 60) + logger.info("LLM Provider: Enterprise Inference (Keycloak)") + logger.info(f"Base URL: {config.BASE_URL}") + 
def clean_markdown_formatting(text: str) -> str:
    """
    Remove markdown formatting symbols from text

    Emphasis markers, inline-code backticks, header hashes and list markers
    are stripped while the text they wrap is kept. Fenced code blocks
    (```...```) are removed together with their content.

    Underscore emphasis is only recognised when the underscores are not
    glued to a word character, so identifiers such as ``max_pdf_pages``
    keep their underscores instead of being collapsed to ``maxpdfpages``.

    Args:
        text: Text that may contain markdown formatting. ``None`` or an
            empty string is accepted and yields an empty string.

    Returns:
        Clean text without markdown symbols, with leading and trailing
        whitespace stripped
    """
    # The LLM may return no content at all; treat that as an empty summary
    # instead of letting re.sub raise TypeError on None.
    if not text:
        return ""

    # Remove bold (**text** or __text__)
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    # Lookarounds keep intraword double underscores (e.g. a__b__c) intact.
    text = re.sub(r'(?<!\w)__(.+?)__(?!\w)', r'\1', text)

    # Remove italic (*text* or _text_)
    text = re.sub(r'\*(.+?)\*', r'\1', text)
    # Lookarounds keep snake_case identifiers intact.
    text = re.sub(r'(?<!\w)_(.+?)_(?!\w)', r'\1', text)

    # Remove code blocks (```text```), including their content
    text = re.sub(r'```.*?```', '', text, flags=re.DOTALL)

    # Remove inline code (`text`), keeping the wrapped text
    text = re.sub(r'`(.+?)`', r'\1', text)

    # Remove headers (# text)
    text = re.sub(r'^#+\s+', '', text, flags=re.MULTILINE)

    # Remove bullet points and list markers
    text = re.sub(r'^\s*[-*+]\s+', '', text, flags=re.MULTILINE)
    text = re.sub(r'^\s*\d+\.\s+', '', text, flags=re.MULTILINE)

    return text.strip()
+Your task is to create clear, concise, and accurate summaries of the provided text. +Focus on the main points and key information while maintaining the original meaning. + +IMPORTANT: Provide the summary in plain text format only. Do not use any markdown formatting symbols like **, *, _, or other special characters for formatting. Write in a clean, readable paragraph format.""" + + user_prompt = f"""Please provide a comprehensive summary of the following text: + +{text} + +Summary:""" + + try: + logger.info(f"Generating summary with {self.model} (stream={stream})") + + response = self.client.chat.completions.create( + model=self.model, + messages=[ + {"role": "system", "content": system_prompt}, + {"role": "user", "content": user_prompt} + ], + max_tokens=max_tokens, + temperature=temperature, + stream=stream + ) + + if stream: + return self._stream_response(response) + else: + summary = response.choices[0].message.content + # Clean any markdown formatting from the response + summary = clean_markdown_formatting(summary) + logger.info(f"Generated summary: {len(summary)} characters") + return summary + + except Exception as e: + logger.error(f"LLM summarization error: {str(e)}") + raise Exception(f"Failed to generate summary: {str(e)}") + + def _stream_response(self, response) -> Iterator[str]: + """Stream LLM response chunks (with markdown cleaning)""" + accumulated = "" + for chunk in response: + if chunk.choices[0].delta.content: + accumulated += chunk.choices[0].delta.content + # Only yield when we have a complete sentence or paragraph + if accumulated.endswith(('.', '!', '?', '\n')): + cleaned = clean_markdown_formatting(accumulated) + yield cleaned + accumulated = "" + + # Yield any remaining content + if accumulated: + cleaned = clean_markdown_formatting(accumulated) + yield cleaned + + def health_check(self) -> Dict[str, Any]: + """ + Check if LLM service is healthy + + Returns: + Health status dictionary + """ + try: + # Check if Keycloak is configured + if 
class PDFService:
    """Service for extracting text from PDF and DOCX files"""

    def __init__(self):
        """Initialize document extraction service"""
        logger.info("Document Extraction Service initialized")

    def extract_text(self, file_path: str) -> str:
        """
        Extract text from PDF or DOCX file

        The extractor is chosen from the (case-insensitive) file extension.

        Args:
            file_path: Path to document file

        Returns:
            Extracted text content

        Raises:
            Exception: If the extension is unsupported or extraction fails;
                the underlying error is attached as __cause__
        """
        try:
            filename_lower = file_path.lower()

            if filename_lower.endswith('.pdf'):
                return self._extract_from_pdf(file_path)
            # NOTE(review): '.doc' is routed to the DOCX reader as well -
            # confirm that reader can open the legacy .doc files expected here.
            if filename_lower.endswith(('.docx', '.doc')):
                return self._extract_from_docx(file_path)
            raise Exception(f"Unsupported file type: {file_path}")

        except Exception as e:
            logger.error(f"Document extraction error: {str(e)}")
            # Chain the original error so the traceback keeps the root cause.
            raise Exception(f"Failed to extract text from document: {str(e)}") from e

    def _extract_from_pdf(self, pdf_path: str) -> str:
        """
        Extract text from PDF file with automatic OCR fallback for image-based PDFs
        Implements page limit (config.MAX_PDF_PAGES) to prevent processing
        extremely large PDFs

        Args:
            pdf_path: Path to PDF file

        Returns:
            Extracted text content, prefixed with a "[Note: ...]" line when
            the PDF was longer than the page limit
        """
        logger.info(f"Extracting text from PDF: {pdf_path}")

        # Kept separate from the page text so the banner cannot make a
        # scanned PDF look like it contains real text.
        truncation_note = ""
        page_texts = []

        # First, try standard text extraction
        with open(pdf_path, "rb") as file:
            pdf_reader = PdfReader(file)
            num_pages = len(pdf_reader.pages)

            logger.info(f"PDF has {num_pages} pages")

            # Apply page limit
            max_pages = config.MAX_PDF_PAGES
            pages_to_process = min(num_pages, max_pages)

            if num_pages > max_pages:
                logger.warning(f"PDF has {num_pages} pages. Processing only first {max_pages} pages to prevent timeout.")
                truncation_note = f"[Note: This PDF has {num_pages} pages. Processing first {max_pages} pages only.]\n\n"

            for page_num, page in enumerate(pdf_reader.pages[:pages_to_process], 1):
                # Defensive: treat a missing extraction result as empty text.
                page_text = page.extract_text() or ""
                page_texts.append(page_text)
                logger.debug(f"Extracted {len(page_text)} characters from page {page_num}")

        text_content = "\n".join(page_texts)
        extracted_length = len(text_content.strip())
        logger.info(f"Extracted {extracted_length} characters from PDF")

        # If (almost) no text was extracted, the PDF is likely image-based - use OCR.
        # Only real page text is measured here, never the truncation note.
        if extracted_length < 50:  # Threshold for considering PDF as image-based
            logger.info("PDF appears to be image-based or has minimal text. Using OCR...")
            text_content = self._extract_with_ocr(pdf_path, max_pages=pages_to_process)
            extracted_length = len(text_content.strip())
            logger.info(f"OCR extracted {extracted_length} characters from PDF")

        return (truncation_note + text_content).strip()

    def _extract_with_ocr(self, pdf_path: str, max_pages: Optional[int] = None) -> str:
        """
        Extract text from PDF using OCR (for image-based PDFs)

        Args:
            pdf_path: Path to PDF file
            max_pages: Maximum number of pages to process (None or 0 = all pages)

        Returns:
            Extracted text using OCR

        Raises:
            Exception: If page rendering or OCR fails; the underlying error
                is attached as __cause__
        """
        try:
            logger.info(f"Starting OCR extraction for: {pdf_path}")

            # Convert PDF pages to images. Passing the limit as last_page
            # avoids rendering pages at 300 DPI that would be thrown away.
            images = convert_from_path(pdf_path, dpi=300, last_page=max_pages or None)
            if max_pages:
                # Cheap guard in case more pages than requested come back.
                images = images[:max_pages]

            pages_to_process = len(images)
            logger.info(f"Converted PDF to {pages_to_process} images")

            # Perform OCR on each page
            page_texts = []
            for page_num, image in enumerate(images, 1):
                logger.info(f"Running OCR on page {page_num}/{pages_to_process}")
                page_text = pytesseract.image_to_string(image)
                page_texts.append(page_text)
                logger.debug(f"OCR extracted {len(page_text)} characters from page {page_num}")

            text_content = "\n".join(page_texts).strip()
            logger.info(f"OCR successfully extracted {len(text_content)} characters")

            return text_content

        except Exception as e:
            logger.error(f"OCR extraction failed: {str(e)}")
            raise Exception(f"Failed to extract text using OCR: {str(e)}") from e

    def _extract_from_docx(self, docx_path: str) -> str:
        """
        Extract text from DOCX file

        Paragraph text comes first (one paragraph per line), followed by
        table content (one row per line, cells separated by a space).

        Args:
            docx_path: Path to DOCX file

        Returns:
            Extracted text content
        """
        logger.info(f"Extracting text from DOCX: {docx_path}")

        doc = Document(docx_path)

        # Extract text from all paragraphs
        parts = [f"{paragraph.text}\n" for paragraph in doc.paragraphs]

        # Extract text from tables if present
        for table in doc.tables:
            for row in table.rows:
                parts.append("".join(f"{cell.text} " for cell in row.cells) + "\n")

        text_content = "".join(parts).strip()
        logger.info(f"Successfully extracted {len(text_content)} characters from DOCX")

        return text_content

    def get_pdf_metadata(self, pdf_path: str) -> dict:
        """
        Get PDF metadata (title, author, pages, etc.)

        Args:
            pdf_path: Path to PDF file

        Returns:
            Dictionary with "num_pages", "title" and "author"; title and
            author fall back to "Unknown". An empty dict is returned when
            the file cannot be read.
        """
        try:
            with open(pdf_path, "rb") as file:
                pdf_reader = PdfReader(file)
                info = pdf_reader.metadata

                return {
                    "num_pages": len(pdf_reader.pages),
                    "title": info.get("/Title", "Unknown") if info else "Unknown",
                    "author": info.get("/Author", "Unknown") if info else "Unknown",
                }

        except Exception as e:
            # Best effort by design: metadata is optional, so log and return nothing.
            logger.error(f"Error getting PDF metadata: {str(e)}")
            return {}
============================================================================= +# +# 1. Copy backend/.env.example to backend/.env and configure your Keycloak credentials +# +# 2. Start the application: +# docker-compose up -d +# +# 3. Access the application: +# - Frontend: http://localhost:5173 +# - Backend API: http://localhost:8000 +# - API Docs: http://localhost:8000/docs +# +# 4. View logs: +# docker-compose logs -f +# +# 5. Stop the application: +# docker-compose down +# +# ============================================================================= diff --git a/DocSummarization/frontend/.dockerignore b/DocSummarization/frontend/.dockerignore new file mode 100644 index 0000000000..cd2b1a1eb1 --- /dev/null +++ b/DocSummarization/frontend/.dockerignore @@ -0,0 +1,6 @@ +node_modules +dist +.git +.env +.DS_Store +npm-debug.log diff --git a/DocSummarization/frontend/Dockerfile b/DocSummarization/frontend/Dockerfile new file mode 100644 index 0000000000..a2cffd1f1d --- /dev/null +++ b/DocSummarization/frontend/Dockerfile @@ -0,0 +1,58 @@ +# Build stage +FROM node:18-alpine AS build + +WORKDIR /app + +COPY package*.json ./ +RUN npm install + +COPY . . 
+RUN npm run build + +# Production stage +FROM nginx:alpine + +COPY --from=build /app/dist /usr/share/nginx/html + +# Create nginx config with increased file upload limit and no-cache headers +RUN echo 'server { \ + listen 80; \ + server_name localhost; \ + client_max_body_size 500M; \ + \ + location / { \ + root /usr/share/nginx/html; \ + index index.html; \ + try_files $uri $uri/ /index.html; \ + \ + # Disable cache for HTML \ + add_header Cache-Control "no-store, no-cache, must-revalidate, proxy-revalidate, max-age=0"; \ + add_header Pragma "no-cache"; \ + add_header Expires "0"; \ + } \ + \ + location ~* \.(js|css|png|jpg|jpeg|gif|ico|svg)$ { \ + root /usr/share/nginx/html; \ + expires -1; \ + add_header Cache-Control "no-store, no-cache, must-revalidate, proxy-revalidate, max-age=0"; \ + } \ + \ + location /v1/ { \ + proxy_pass http://backend:8000; \ + proxy_http_version 1.1; \ + proxy_set_header Upgrade $http_upgrade; \ + proxy_set_header Connection "upgrade"; \ + proxy_set_header Host $host; \ + proxy_set_header X-Real-IP $remote_addr; \ + proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for; \ + proxy_set_header X-Forwarded-Proto $scheme; \ + client_max_body_size 500M; \ + proxy_read_timeout 600s; \ + proxy_connect_timeout 600s; \ + proxy_send_timeout 600s; \ + } \ +}' > /etc/nginx/conf.d/default.conf + +EXPOSE 80 + +CMD ["nginx", "-g", "daemon off;"] diff --git a/DocSummarization/frontend/index.html b/DocSummarization/frontend/index.html new file mode 100644 index 0000000000..6e33b1c5cb --- /dev/null +++ b/DocSummarization/frontend/index.html @@ -0,0 +1,13 @@ + + + + + + + Document Summarization - Cloud2Labs + + +
+ + + diff --git a/DocSummarization/frontend/package.json b/DocSummarization/frontend/package.json new file mode 100644 index 0000000000..4edece2b37 --- /dev/null +++ b/DocSummarization/frontend/package.json @@ -0,0 +1,24 @@ +{ + "name": "docsum-react-ui", + "version": "1.0.0", + "type": "module", + "scripts": { + "dev": "vite", + "build": "vite build", + "preview": "vite preview" + }, + "dependencies": { + "react": "^18.2.0", + "react-dom": "^18.2.0", + "react-router-dom": "^6.20.0", + "lucide-react": "^0.294.0", + "react-hot-toast": "^2.4.1" + }, + "devDependencies": { + "@vitejs/plugin-react": "^4.2.0", + "vite": "^5.0.0", + "tailwindcss": "^3.3.5", + "autoprefixer": "^10.4.16", + "postcss": "^8.4.32" + } +} diff --git a/DocSummarization/frontend/postcss.config.js b/DocSummarization/frontend/postcss.config.js new file mode 100644 index 0000000000..2e7af2b7f1 --- /dev/null +++ b/DocSummarization/frontend/postcss.config.js @@ -0,0 +1,6 @@ +export default { + plugins: { + tailwindcss: {}, + autoprefixer: {}, + }, +} diff --git a/DocSummarization/frontend/public/assets/img/docsum-generate-page.png b/DocSummarization/frontend/public/assets/img/docsum-generate-page.png new file mode 100644 index 0000000000..839c68379b Binary files /dev/null and b/DocSummarization/frontend/public/assets/img/docsum-generate-page.png differ diff --git a/DocSummarization/frontend/public/assets/img/docsum-homepage1.png b/DocSummarization/frontend/public/assets/img/docsum-homepage1.png new file mode 100644 index 0000000000..af95b1a30a Binary files /dev/null and b/DocSummarization/frontend/public/assets/img/docsum-homepage1.png differ diff --git a/DocSummarization/frontend/public/assets/img/docsum-homepage2.png b/DocSummarization/frontend/public/assets/img/docsum-homepage2.png new file mode 100644 index 0000000000..f556397dbc Binary files /dev/null and b/DocSummarization/frontend/public/assets/img/docsum-homepage2.png differ diff --git 
a/DocSummarization/frontend/public/assets/img/ui-result-page.png b/DocSummarization/frontend/public/assets/img/ui-result-page.png new file mode 100644 index 0000000000..513ce033f6 Binary files /dev/null and b/DocSummarization/frontend/public/assets/img/ui-result-page.png differ diff --git a/DocSummarization/frontend/public/cloud2labs-logo.png b/DocSummarization/frontend/public/cloud2labs-logo.png new file mode 100644 index 0000000000..2a0ef60222 Binary files /dev/null and b/DocSummarization/frontend/public/cloud2labs-logo.png differ diff --git a/DocSummarization/frontend/src/App.jsx b/DocSummarization/frontend/src/App.jsx new file mode 100644 index 0000000000..8f5ee87aea --- /dev/null +++ b/DocSummarization/frontend/src/App.jsx @@ -0,0 +1,43 @@ +import { BrowserRouter, Routes, Route } from 'react-router-dom'; +import { Toaster } from 'react-hot-toast'; +import Layout from './components/layout/Layout'; +import Home from './pages/Home'; +import Generate from './pages/Generate'; + +function App() { + return ( + + + + }> + } /> + } /> + + + + ); +} + +export default App; diff --git a/DocSummarization/frontend/src/components/FileUpload.jsx b/DocSummarization/frontend/src/components/FileUpload.jsx new file mode 100644 index 0000000000..f524ba8469 --- /dev/null +++ b/DocSummarization/frontend/src/components/FileUpload.jsx @@ -0,0 +1,137 @@ +import { useState } from 'react'; +import { Upload, FileText, X } from 'lucide-react'; + +const FileUpload = ({ onSubmit, isLoading, acceptedTypes, fileType, title, maxFileSize }) => { + const [dragActive, setDragActive] = useState(false); + const [file, setFile] = useState(null); + + const handleDrag = (e) => { + e.preventDefault(); + e.stopPropagation(); + if (e.type === "dragenter" || e.type === "dragover") { + setDragActive(true); + } else if (e.type === "dragleave") { + setDragActive(false); + } + }; + + const handleDrop = (e) => { + e.preventDefault(); + e.stopPropagation(); + setDragActive(false); + + if (e.dataTransfer.files && 
e.dataTransfer.files[0]) { + const droppedFile = e.dataTransfer.files[0]; + const fileExtension = '.' + droppedFile.name.split('.').pop().toLowerCase(); + + if (acceptedTypes.includes(fileExtension)) { + setFile(droppedFile); + } + } + }; + + const handleChange = (e) => { + e.preventDefault(); + if (e.target.files && e.target.files[0]) { + setFile(e.target.files[0]); + } + }; + + const handleRemoveFile = () => { + setFile(null); + }; + + const handleSubmit = (e) => { + e.preventDefault(); + if (!file) return; + + const formData = new FormData(); + formData.append('type', fileType); + formData.append('messages', ''); + formData.append('files', file); + formData.append('max_tokens', 1024); + formData.append('language', 'en'); + formData.append('summary_type', 'auto'); + formData.append('stream', 'false'); + + onSubmit(formData, false); + }; + + return ( +
+
+ +

{title}

+
+ +
+
+ {!file ? ( + <> + +

+ Drop your file here or click to browse +

+

+ Supported formats: {acceptedTypes.join(', ')} +

+

+ Maximum file size: {maxFileSize || '50 MB'} +

+ + + + ) : ( +
+
+ +
+

{file.name}

+

+ {(file.size / 1024 / 1024).toFixed(2)} MB +

+
+
+ +
+ )} +
+ + +
+
+ ); +}; + +export default FileUpload; diff --git a/DocSummarization/frontend/src/components/TextInput.jsx b/DocSummarization/frontend/src/components/TextInput.jsx new file mode 100644 index 0000000000..065db85312 --- /dev/null +++ b/DocSummarization/frontend/src/components/TextInput.jsx @@ -0,0 +1,56 @@ +import { useState } from 'react'; +import { FileText } from 'lucide-react'; + +const TextInput = ({ onSubmit, isLoading }) => { + const [text, setText] = useState(''); + + const handleSubmit = (e) => { + e.preventDefault(); + if (!text.trim()) return; + + const formData = new FormData(); + formData.append('type', 'text'); + formData.append('messages', text); + formData.append('max_tokens', 1024); + formData.append('language', 'en'); + formData.append('summary_type', 'auto'); + formData.append('stream', 'false'); + + onSubmit(formData, false); + }; + + return ( +
+
+ +

Paste Text

+
+ +
+
+ +