Document Text Extraction Tool Python, OCR

👤 Sharing: AI
```python
import pytesseract
from PIL import Image
import os
import io
from pdfminer.high_level import extract_text as pdf_extract_text
from docx import Document

def extract_text_from_image(image_path, lang='eng'):
    """
    Extracts text from an image using OCR.

    Args:
        image_path (str): The path to the image file.
        lang (str): Language code passed to the OCR engine (e.g., 'eng' for English).

    Returns:
        str: The extracted text.  Returns an empty string if there's an error.
    """
    try:
        # Open the image as a context manager so its file handle is released
        # as soon as OCR finishes, even when OCR itself raises.
        with Image.open(image_path) as img:
            return pytesseract.image_to_string(img, lang=lang)
    except Exception as e:
        # Best-effort API: report the failure and fall back to an empty result.
        print(f"Error processing image {image_path}: {e}")
        return ""


def extract_text_from_pdf(pdf_path):
    """
    Pulls the textual content out of a PDF document.

    Args:
        pdf_path (str): Location of the PDF on disk.

    Returns:
        str: Whatever text the PDF extractor produced, or an empty string
        when extraction fails for any reason.
    """
    try:
        contents = pdf_extract_text(pdf_path)
    except Exception as err:
        # Failures are reported on stdout and swallowed; callers get "".
        print(f"Error processing PDF {pdf_path}: {err}")
        return ""
    return contents


def extract_text_from_docx(docx_path):
    """
    Reads the paragraph text of a DOCX document.

    Args:
        docx_path (str): Location of the DOCX file on disk.

    Returns:
        str: The text of every paragraph, one per line, or an empty string
        when the document cannot be processed.
    """
    try:
        document = Document(docx_path)
        # One output line per paragraph, in document order.
        return '\n'.join(para.text for para in document.paragraphs)
    except Exception as err:
        # Failures are reported on stdout and swallowed; callers get "".
        print(f"Error processing DOCX {docx_path}: {err}")
        return ""


# File extensions that are routed to the OCR path.  '.tif' is the common
# short form of '.tiff' and is accepted alongside it.
SUPPORTED_IMAGE_EXTENSIONS = frozenset(
    {'.jpg', '.jpeg', '.png', '.gif', '.bmp', '.tiff', '.tif'}
)


def extract_text_from_file(file_path, lang='eng'):
    """
    Extracts text from a file, choosing the extractor from the file extension.

    The extension is compared case-insensitively.  Images go through OCR,
    PDFs and DOCX files through their dedicated extractors, and '.txt' files
    are read directly as UTF-8.

    Args:
        file_path (str): The path to the file.
        lang (str): Language code for OCR (used for images only).

    Returns:
        str: The extracted text.  Returns an empty string if file type not supported or error occurs.
    """
    file_extension = os.path.splitext(file_path)[1].lower()

    if file_extension in SUPPORTED_IMAGE_EXTENSIONS:
        return extract_text_from_image(file_path, lang=lang)
    if file_extension == '.pdf':
        return extract_text_from_pdf(file_path)
    if file_extension == '.docx':
        return extract_text_from_docx(file_path)
    if file_extension == '.txt':
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                return f.read()
        except Exception as e:
            # Same best-effort contract as the other extractors: report and
            # return an empty result instead of propagating.
            print(f"Error reading text file {file_path}: {e}")
            return ""

    print(f"Unsupported file type: {file_extension}")
    return ""


def main():
    """
    Main function to demonstrate the text extraction tool.

    Prompts the user for a file path, extracts the text from that file and
    prints it.  Prints an error message instead when the path does not refer
    to an existing regular file or when nothing could be extracted.
    """
    # Strip surrounding whitespace: stray spaces from typing or pasting the
    # path would otherwise make the lookup fail.
    file_path = input("Enter the path to the file: ").strip()

    # isfile (rather than exists) also rejects directories, which cannot be
    # read as a document and would only produce a confusing message later.
    if not os.path.isfile(file_path):
        print(f"Error: File not found at path: {file_path}")
        return

    extracted_text = extract_text_from_file(file_path)

    if extracted_text:
        print("\nExtracted Text:\n")
        print(extracted_text)
    else:
        print("No text extracted or an error occurred.")


# Run the interactive demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
```
👁️ Viewed: 11
Document Text Extraction Tool Python, OCR

Comments

Site Statistics