Document Text Extraction Tool Python, OCR
👤 Sharing: AI
```python
import pytesseract
from PIL import Image
import os
import io
from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document
def extract_text_from_image(image_path, lang='eng'):
    """
    Run OCR over an image file and return the recognised text.

    The image is opened with ``PIL.Image.open`` and handed to
    ``pytesseract.image_to_string``.

    Args:
        image_path (str): The path to the image file.
        lang (str): Language code passed to the OCR engine
            (e.g., 'eng' for English).

    Returns:
        str: The extracted text. Returns an empty string if any step
        raises; the error is printed rather than propagated.
    """
    try:
        # Use the image as a context manager so the underlying file handle
        # is released as soon as OCR finishes, instead of lingering until
        # the object is garbage-collected.
        with Image.open(image_path) as img:
            return pytesseract.image_to_string(img, lang=lang)
    except Exception as e:
        # Deliberately broad: this function is a best-effort boundary and
        # callers rely on getting "" back instead of an exception.
        print(f"Error processing image {image_path}: {e}")
        return ""
def extract_text_from_pdf(pdf_path):
    """
    Pull the text content out of a PDF using pdfminer's ``extract_text``.

    Args:
        pdf_path (str): The path to the PDF file.

    Returns:
        str: The text pdfminer produced, or an empty string when the
        extraction raised an exception (the error is printed).
    """
    try:
        extracted = pdf_extract_text(pdf_path)
    except Exception as err:
        # Best-effort contract: report the problem and fall back to "".
        print(f"Error processing PDF {pdf_path}: {err}")
        extracted = ""
    return extracted
def extract_text_from_docx(docx_path):
    """
    Collect the paragraph text of a DOCX file into a single string.

    Only the entries of the document's ``paragraphs`` collection are read;
    their texts are joined with newline characters.

    Args:
        docx_path (str): The path to the DOCX file.

    Returns:
        str: The newline-joined paragraph text, or an empty string when
        loading or reading the document raised an exception (the error is
        printed).
    """
    try:
        paragraphs = Document(docx_path).paragraphs
        # One paragraph per line in the output.
        return '\n'.join(item.text for item in paragraphs)
    except Exception as err:
        print(f"Error processing DOCX {docx_path}: {err}")
        return ""
def extract_text_from_file(file_path, lang='eng'):
    """
    Extract text from a file, choosing the extractor from its extension.

    The extension is compared case-insensitively. Images go through OCR,
    '.pdf' and '.docx' go to their dedicated extractors, and '.txt' files
    are read directly as UTF-8.

    Args:
        file_path (str): The path to the file.
        lang (str): Language code for OCR (used for images only).

    Returns:
        str: The extracted text. Returns an empty string if the file type
        is not supported or an error occurs; in both cases a message is
        printed.
    """
    # '.tif' is the short spelling of '.tiff' and is just as common in the
    # wild, so both are routed to the image/OCR path.
    image_extensions = {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tif', '.tiff'}

    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension in image_extensions:
        return extract_text_from_image(file_path, lang=lang)
    if file_extension == '.pdf':
        return extract_text_from_pdf(file_path)
    if file_extension == '.docx':
        return extract_text_from_docx(file_path)
    if file_extension == '.txt':
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception as e:
            # Same best-effort contract as the other extractors: missing
            # files, permission problems and decode errors all yield "".
            print(f"Error reading text file {file_path}: {e}")
            return ""

    print(f"Unsupported file type: {file_extension}")
    return ""
def main():
    """
    Interactive demo: prompt for a file path and print the text found in it.

    Prints an error and stops if the path does not exist; otherwise prints
    either the extracted text or a notice that nothing was extracted.
    """
    target = input("Enter the path to the file: ")

    if os.path.exists(target):
        result = extract_text_from_file(target)
        if not result:
            # An empty string covers both "no text" and "extractor failed".
            print("No text extracted or an error occurred.")
            return
        print("\nExtracted Text:\n")
        print(result)
    else:
        print(f"Error: File not found at path: {target}")
# Start the interactive demo only when this file is executed as a script,
# so importing the module for its extractor functions has no side effects.
if __name__ == "__main__":
    main()
```
👁️ Viewed: 11
Comments