-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathpdf_processor.py
More file actions
64 lines (49 loc) · 2.28 KB
/
Copy pathpdf_processor.py
File metadata and controls
64 lines (49 loc) · 2.28 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# We are importing the PyPDF2 library
# It is a library that helps us in reading and extracting the text from PDF files
import PyPDF2
# we are importing io library
# io library helps us handle file data in memory
# When the user uploads a PDF through streamlit it comes as bytes (raw data)
# io helps us in converting those bytes into a readable file format
import io
# This is the main function that will extract the text from PDF file
# It takes one input : the uploaded PDF file
def extract_text_from_pdf(uploaded_file):
# uploaded file comes from streamlit as raw bytes
# PyPDF2 cannot read raw bytes directly
# So we wrap it in io.BytesIO to make it readable
# Think of it like putting the file in a proper folder so PyPDF2 can open it
pdf_file = io.BytesIO(uploaded_file.read())
# PdfReader is a class inside PyPDF2
# It opens and reads the PDF file
# We pass our pdf_file to it
pdf_reader = PyPDF2.PdfReader(pdf_file)
# We create an empty string to store all extracted text
# We will keep adding text from each page to this variable
extracted_text = ""
# pdf_reader.pages gives us a list of all pages in the PDF
# We loop through each page one by one
for page in pdf_reader.pages:
# page.extract_text() extracts all text from that single page
# We add it to our extracted_text variable
# \n adds a new line after each page so text is organized
extracted_text += page.extract_text() + "\n"
# After looping through all pages
# extracted_text now contains all text from the entire PDF
# We return it so other parts of our app can use it
return extracted_text
# This is a helper function that gives us basic info about the PDF
# It takes the uploaded file as input
def get_pdf_info(uploaded_file):
# Same as before — wrap bytes in BytesIO
pdf_file = io.BytesIO(uploaded_file.read())
# Open the PDF with PdfReader
pdf_reader = PyPDF2.PdfReader(pdf_file)
# Get total number of pages
# len(pdf_reader.pages) counts how many pages are in the PDF
total_pages = len(pdf_reader.pages)
# We return a dictionary with basic PDF information
# Dictionary is like a labeled container with key value pairs
return {
"total_pages": total_pages,
}