# export_utils.py - FakeNewsDetection (thisispit/FakeNewsDetection, master branch)

import pandas as pd
from fpdf import FPDF
from datetime import datetime
from typing import List, Dict, Optional
import io

class PDFReport(FPDF):
    """Custom PDF report for fake news detection results.

    Every section is drawn with the built-in 'Arial' core font. According to
    the fpdf2 documentation, core fonts can only encode Latin-1 characters and
    raise an encoding error for anything else, so all caller-supplied text is
    passed through ``_core_font_safe`` before it is drawn.
    """

    @staticmethod
    def _core_font_safe(text) -> str:
        """Return *text* as a string restricted to Latin-1 characters.

        Characters the core font cannot encode (curly quotes, em-dashes,
        emoji, non-Latin scripts, ...) are replaced one-for-one by '?', so the
        length of the text is preserved.
        """
        return str(text).encode('latin-1', 'replace').decode('latin-1')

    def header(self):
        """Draw the centred report title at the top of each page."""
        self.set_font('Arial', 'B', 16)
        self.cell(0, 10, 'Fake News Detection Report', 0, 1, 'C')
        self.ln(5)

    def footer(self):
        """Draw the centred page number 15 units above the page bottom."""
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')

    def add_metadata(self, timestamp: str):
        """Add the 'Generated: <timestamp>' line.

        Args:
            timestamp: Pre-formatted generation time to print.
        """
        self.set_font('Arial', '', 10)
        self.cell(0, 8, f'Generated: {self._core_font_safe(timestamp)}', 0, 1)
        self.ln(5)

    def add_prediction_result(self, prediction: str, confidence: float,
                            real_prob: float, fake_prob: float):
        """Add the prediction results section.

        Args:
            prediction: Result label; 'FAKE NEWS' is printed in red,
                'REAL NEWS' in green and any other label in orange.
            confidence: Confidence as a fraction (printed multiplied by 100).
            real_prob: Real-news probability as a fraction.
            fake_prob: Fake-news probability as a fraction.
        """
        self.set_font('Arial', 'B', 14)
        self.cell(0, 10, 'Prediction Result', 0, 1)

        self.set_font('Arial', 'B', 12)

        # Set color based on prediction
        if prediction == 'FAKE NEWS':
            self.set_text_color(255, 0, 0)
        elif prediction == 'REAL NEWS':
            self.set_text_color(0, 128, 0)
        else:
            self.set_text_color(255, 165, 0)

        self.cell(0, 10, f'Result: {self._core_font_safe(prediction)}', 0, 1)
        # Restore black so the following lines are not coloured.
        self.set_text_color(0, 0, 0)

        self.set_font('Arial', '', 11)
        self.cell(0, 8, f'Confidence: {confidence*100:.2f}%', 0, 1)
        self.cell(0, 8, f'Real Probability: {real_prob*100:.2f}%', 0, 1)
        self.cell(0, 8, f'Fake Probability: {fake_prob*100:.2f}%', 0, 1)
        self.ln(5)

    def add_text_sample(self, text: str, max_length: int = 500):
        """Add a (possibly truncated) sample of the analyzed text.

        Args:
            text: The analyzed input text; ``None`` is treated as empty.
            max_length: Maximum number of characters printed before the
                sample is cut off and '...' is appended.
        """
        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Analyzed Text Sample', 0, 1)

        self.set_font('Arial', '', 10)

        text = '' if text is None else str(text)

        # Truncate if too long
        display_text = text[:max_length]
        if len(text) > max_length:
            display_text += '...'

        # Multi-cell for word wrapping; sanitise first because article text
        # routinely contains characters outside Latin-1.
        self.multi_cell(0, 6, self._core_font_safe(display_text))
        self.ln(5)

    def add_explanation(self, features: List[tuple], prediction_class: str):
        """Add a table of the most influential words, if any are given.

        Args:
            features: ``(word, weight)`` pairs; only the first ten are
                printed. Nothing is drawn when this is empty or ``None``.
            prediction_class: Class name used in the influence column. A
                positive weight is labelled as supporting this class, any
                other weight as opposing it.
        """
        if not features:
            return

        self.set_font('Arial', 'B', 12)
        self.cell(0, 10, 'Top Influential Words', 0, 1)

        self.set_font('Arial', '', 10)
        self.cell(60, 8, 'Word/Phrase', 1, 0, 'C')
        self.cell(40, 8, 'Weight', 1, 0, 'C')
        self.cell(80, 8, 'Influence', 1, 1, 'C')

        safe_class = self._core_font_safe(prediction_class)

        for word, weight in features[:10]:
            # Limit to 30 characters so the word stays inside its cell.
            self.cell(60, 8, self._core_font_safe(word)[:30], 1, 0)
            self.cell(40, 8, f'{weight:.4f}', 1, 0, 'C')

            if weight > 0:
                influence = f'Supports {safe_class}'
            else:
                influence = f'Opposes {safe_class}'

            self.cell(80, 8, influence, 1, 1)

        self.ln(5)

def generate_pdf_report(prediction: str, confidence: float, real_prob: float,
                       fake_prob: float, text: str, features: Optional[List[tuple]] = None) -> bytes:
    """
    Build a one-prediction PDF report and return it as raw bytes.

    The report contains, in order: a generation timestamp (local time,
    'YYYY-MM-DD HH:MM:SS'), the prediction results, a sample of the input
    text and, when features are supplied, the influential-words table.

    Args:
        prediction: Prediction result (REAL NEWS/FAKE NEWS)
        confidence: Confidence score
        real_prob: Real news probability
        fake_prob: Fake news probability
        text: Input text
        features: Optional LIME features

    Returns:
        PDF bytes
    """
    report = PDFReport()
    report.add_page()

    report.add_metadata(datetime.now().strftime('%Y-%m-%d %H:%M:%S'))
    report.add_prediction_result(prediction, confidence, real_prob, fake_prob)
    report.add_text_sample(text)

    if features:
        # The table labels use the bare class name, e.g. 'FAKE' rather than
        # 'FAKE NEWS'.
        report.add_explanation(features, prediction.replace(' NEWS', ''))

    # output() is expected to hand back a bytearray (fpdf2); normalise it to
    # immutable bytes for the caller.
    return bytes(report.output())

def _format_percentage(value):
    """Format a fractional value as a percentage string with two decimals.

    Missing values (``None`` or NaN) become an empty string and values that
    cannot be converted to a number are returned unchanged.
    """
    if value is None:
        return ''
    try:
        number = float(value)
    except (TypeError, ValueError):
        return value
    if number != number:  # NaN is the only value not equal to itself
        return ''
    return f'{number * 100:.2f}%'


def create_csv_export(predictions: List[Dict]) -> str:
    """
    Create CSV export from predictions list.

    Only the known export columns are written, in a fixed order; any of them
    that is absent from the data is skipped. The 'confidence', 'real_prob'
    and 'fake_prob' columns are rendered as percentages (0.9 -> '90.00%'),
    with missing values exported as empty cells.

    Args:
        predictions: List of prediction dictionaries

    Returns:
        CSV string without an index column, or "" when *predictions* is empty
    """
    if not predictions:
        return ""

    df = pd.DataFrame(predictions)

    # Select and order columns
    columns = ['timestamp', 'prediction', 'confidence', 'real_prob', 'fake_prob',
               'input_method', 'input_text']

    # Only include columns that exist. Work on an explicit copy so the
    # assignments below never write into a view of the original frame.
    available_columns = [col for col in columns if col in df.columns]
    df = df[available_columns].copy()

    # Format percentages
    for col in ('confidence', 'real_prob', 'fake_prob'):
        if col in df.columns:
            df[col] = df[col].apply(_format_percentage)

    # Convert to CSV
    return df.to_csv(index=False)

def process_batch_csv(uploaded_file) -> pd.DataFrame:
    """
    Process uploaded CSV file for batch predictions.

    The text column is chosen by looking for one of the well-known names
    ('text', 'article', 'content', 'headline', 'title', 'body'), first as an
    exact match and then ignoring case and surrounding whitespace. If none
    is present the first column is used. The chosen column is copied to
    'text', converted to strings, and rows whose text is missing or blank
    are dropped.

    Args:
        uploaded_file: Streamlit uploaded file object (anything accepted by
            ``pandas.read_csv``)

    Returns:
        DataFrame with a string 'text' column in addition to the original
        columns

    Raises:
        ValueError: If the file cannot be read or has no usable column; the
            underlying exception is attached as the cause.
    """
    try:
        df = pd.read_csv(uploaded_file)

        # Check for required column (text, article, content, headline, etc.)
        text_columns = ['text', 'article', 'content', 'headline', 'title', 'body']

        # Exact matches keep priority so existing files resolve as before.
        found_column = next((col for col in text_columns if col in df.columns), None)

        if found_column is None:
            # Fall back to headers such as 'Title' or ' Text '.
            by_normalized_name = {}
            for col in df.columns:
                by_normalized_name.setdefault(str(col).strip().lower(), col)
            found_column = next(
                (by_normalized_name[col] for col in text_columns
                 if col in by_normalized_name),
                None,
            )

        if found_column is None:
            # If no standard column found, use first column
            if len(df.columns) == 0:
                raise ValueError("No valid text column found in CSV")
            found_column = df.columns[0]

        # Expose the chosen column as 'text' for consistency
        df['text'] = df[found_column]

        # Remove empty rows. Missing values are dropped before the string
        # conversion so they do not turn into the literal text 'nan'.
        df = df[df['text'].notna()].copy()
        # Non-string columns (e.g. numeric ids) do not support .str, so
        # convert before stripping.
        df['text'] = df['text'].astype(str)
        df = df[df['text'].str.strip() != '']

        return df

    except Exception as e:
        raise ValueError(f"Error processing CSV: {str(e)}") from e