OCRs and auto categorizes scanned docs into folders JavaScript
👤 Sharing: AI
```javascript
/**
* OCR and Auto-Categorization of Scanned Documents
*
* This is a simplified example demonstrating the core concepts.
* It leverages a mock OCR and categorization service for demonstration.
*
* Dependencies:
* - ocr-space-api (npm install; only needed for real OCR. See comments for configuration)
* - node:fs and path (built into Node.js; nothing to install. Note: the npm package named "node-fs" is unrelated and not required)
*/
// **Important: Replace with your actual OCR and Categorization APIs or functions**
// Mock OCR service (replace with a real OCR API like Tesseract.js or OCR Space)
/**
 * Pretends to run OCR on an image and resolves with canned text.
 *
 * The canned text is chosen by the first keyword found (case-sensitively)
 * anywhere in the given path; paths matching no keyword get a default string.
 *
 * @param {string} imagePath - Path of the image to "scan".
 * @returns {Promise<string>} The simulated extracted text.
 */
const mockOCRService = async (imagePath) => {
  // Checked in order, so "invoice" wins over "receipt", which wins over "contract".
  const cannedTextByKeyword = [
    ["invoice", "Invoice Number: INV-2023-123 Date: 2023-11-15 Amount: $100.00"],
    ["receipt", "Receipt Total: $25.50 Store: Grocery Mart Date: 2023-11-14"],
    ["contract", "Agreement between Company A and Company B Effective Date: 2023-11-01"],
  ];
  const fallbackText = "Some default text from the image.";
  const simulatedDelayMs = 500; // Simulate OCR delay

  return new Promise((resolve) => {
    // Stand-in for the real OCR API call
    setTimeout(() => {
      console.log(`Simulating OCR for: ${imagePath}`);
      const match = cannedTextByKeyword.find(([keyword]) => imagePath.includes(keyword));
      resolve(match ? match[1] : fallbackText);
    }, simulatedDelayMs);
  });
};
// Mock Categorization service (replace with a real NLP/ML service)
/**
 * Pretends to classify extracted text and resolves with a category name.
 *
 * Matching is a case-insensitive substring search; the first rule with a
 * matching keyword wins, and text matching no rule is "Uncategorized".
 *
 * @param {string} text - Text previously extracted from a document.
 * @returns {Promise<string>} One of "Invoices", "Receipts", "Contracts" or "Uncategorized".
 */
const mockCategorizationService = async (text) => {
  // Evaluated top to bottom, mirroring an if / else-if chain.
  const categoryRules = [
    { category: "Invoices", keywords: ["invoice"] },
    { category: "Receipts", keywords: ["receipt"] },
    { category: "Contracts", keywords: ["agreement", "contract"] },
  ];
  const simulatedDelayMs = 250; // Simulate categorization delay

  return new Promise((resolve) => {
    setTimeout(() => {
      console.log(`Categorizing text: ${text}`);
      const normalized = text.toLowerCase();
      const matchedRule = categoryRules.find(({ keywords }) =>
        keywords.some((keyword) => normalized.includes(keyword)),
      );
      resolve(matchedRule ? matchedRule.category : "Uncategorized");
    }, simulatedDelayMs);
  });
};
const { mkdirSync } = require('node:fs'); // Synchronous mkdir for startup-time directory setup
const fs = require('node:fs/promises'); // Correct import for modern Node.js
const path = require('path');
// Configuration
const inputDirectory = "./scanned_documents"; // Directory containing scanned documents
const outputDirectory = "./categorized_documents"; // Root directory for categorized folders
// Ensure output directory exists (synchronously at startup).
// mkdirSync is used on purpose: the promise-based mkdir returns immediately, so
// without an await its failure would bypass this try/catch as an unhandled
// rejection and the success message would be printed before the directory exists.
try {
  mkdirSync(outputDirectory, { recursive: true }); // With recursive: true an existing directory is not an error
  console.log(`Output directory '${outputDirectory}' created or already exists.`);
} catch (err) {
  console.error("Error creating output directory:", err);
  process.exit(1); // Exit if we can't create the output directory
}
/**
 * Moves a file, falling back to copy + delete when a plain rename is impossible.
 *
 * A rename cannot cross filesystem boundaries and fails with code `EXDEV` in
 * that case; any other rename error is rethrown untouched.
 *
 * @param {string} sourcePath - Current location of the file.
 * @param {string} destinationPath - Desired location of the file.
 * @returns {Promise<void>}
 */
async function moveFile(sourcePath, destinationPath) {
  try {
    await fs.rename(sourcePath, destinationPath);
  } catch (renameErr) {
    if (renameErr.code !== "EXDEV") {
      throw renameErr;
    }
    // Different filesystem/mount: copy first, remove the source only after the copy succeeded.
    await fs.copyFile(sourcePath, destinationPath);
    await fs.unlink(sourcePath);
  }
}

/**
 * Processes a single document: OCR, Categorization, and Moving to a Folder.
 *
 * Failures at any step are logged to the console and the document is skipped;
 * they are not propagated to the caller.
 *
 * @param {string} imagePath - The path to the scanned image file.
 * @returns {Promise<void>}
 */
async function processDocument(imagePath) {
  try {
    // 1. OCR
    const extractedText = await mockOCRService(imagePath);

    // 2. Categorization
    const category = await mockCategorizationService(extractedText);

    // 3. Create Category Folder (if it doesn't exist)
    const categoryDirectory = path.join(outputDirectory, category);
    try {
      await fs.mkdir(categoryDirectory, { recursive: true });
      console.log(`Category directory '${categoryDirectory}' created (or already exists).`);
    } catch (mkdirErr) {
      console.error(`Error creating category directory '${categoryDirectory}':`, mkdirErr);
      return; // Skip this document
    }

    // 4. Move the Document to the Category Folder
    const filename = path.basename(imagePath);
    const newPath = path.join(categoryDirectory, filename);
    try {
      await moveFile(imagePath, newPath);
      console.log(`Moved '${filename}' to '${categoryDirectory}'`);
    } catch (moveErr) {
      console.error(`Error moving '${filename}' to '${categoryDirectory}':`, moveErr);
    }
  } catch (error) {
    // Covers OCR / categorization failures
    console.error(`Error processing document '${imagePath}':`, error);
  }
}
/**
 * Main function to process all documents in the input directory.
 *
 * Entries are handled one at a time, in the order returned by the directory
 * listing. Only regular files are processed; an entry that cannot be
 * inspected is logged and skipped so that it does not abort the whole batch.
 *
 * @returns {Promise<void>}
 */
async function main() {
  let files;
  try {
    files = await fs.readdir(inputDirectory);
  } catch (err) {
    console.error("Error reading input directory:", err);
    return;
  }

  if (files.length === 0) {
    console.log("No files found in the input directory.");
    return;
  }

  for (const file of files) {
    const filePath = path.join(inputDirectory, file);

    // Check if it's a file (not a subdirectory)
    let fileStat;
    try {
      fileStat = await fs.stat(filePath);
    } catch (statErr) {
      // e.g. a broken symlink or a file removed since the listing was taken
      console.error(`Error inspecting '${filePath}':`, statErr);
      continue;
    }

    if (fileStat.isFile()) {
      await processDocument(filePath);
    }
  }

  console.log("Document processing complete.");
}
// Run the main function; the catch keeps an unexpected failure from becoming an unhandled rejection
main().catch((err) => {
  console.error("Unexpected error while processing documents:", err);
  process.exitCode = 1;
});
/*
**Further Improvements and Real-World Considerations:**
1. **Real OCR Integration:** Replace `mockOCRService` with a real OCR API call. Example using `ocr-space-api`:
```javascript
const ocrSpace = require('ocr-space-api');
const ocrService = async (imagePath) => {
const options = {
apikey: 'YOUR_OCR_SPACE_API_KEY', // Replace with your OCR Space API key
language: 'eng', // Adjust language as needed
isOverlayed: false,
detectOrientation: true,
};
try {
const result = await ocrSpace(imagePath, options);
if (result && result.ParsedResults && result.ParsedResults.length > 0) {
return result.ParsedResults[0].ParsedText;
} else {
console.warn(`OCR Space API returned no text for ${imagePath}`);
return ""; // Or handle the error appropriately
}
} catch (err) {
console.error(`OCR Space API error for ${imagePath}:`, err);
return ""; // Or handle the error appropriately
}
};
```
* **API Key:** You'll need an API key from a service like OCR Space. Sign up for an account and obtain your key.
* **Error Handling:** OCR services can fail. Implement robust error handling (retry mechanisms, logging, etc.).
* **Language Support:** Choose the appropriate language for your documents.
* **Performance:** OCR can be slow. Consider asynchronous processing (e.g., using a queue) to avoid blocking the main thread, especially when processing many documents.
2. **Real Categorization:** Replace `mockCategorizationService` with a real NLP/ML service. Some options:
* **Natural Language Processing (NLP) Libraries:** Use libraries like `natural` or `compromise` for simpler keyword-based categorization.
* **Machine Learning (ML) Models:** Train a more sophisticated ML model (using libraries like TensorFlow.js or ONNX Runtime) to classify documents based on their content. This would require labeled training data.
* **Cloud-Based NLP Services:** Use services like Google Cloud Natural Language API, AWS Comprehend, or Azure Text Analytics for more advanced NLP capabilities.
3. **File Type Handling:** Currently, the code assumes image files. You might need to handle different file types (e.g., PDFs). For PDFs that already contain selectable text, a PDF parsing library (e.g., `pdf-parse`) can extract the text directly and the OCR step can be skipped. Scanned PDFs without a text layer must first be rendered to images, which are then sent to the OCR service.
4. **Error Handling and Logging:** Implement more robust error handling, including:
* Retry mechanisms for failed OCR or categorization calls.
* Logging of errors and warnings to a file.
* Handling cases where a document cannot be processed (e.g., invalid file format).
5. **Asynchronous Processing and Queues:** For large volumes of documents, use a message queue (e.g., RabbitMQ, Redis Queue, or cloud-based queues) to distribute the processing load across multiple workers. This prevents the main process from being overwhelmed. Also use `Promise.all` for concurrent OCR processing of multiple images.
6. **Scalability:** For large-scale deployments, consider using a serverless architecture (e.g., AWS Lambda, Google Cloud Functions, Azure Functions) to automatically scale the processing capacity based on demand.
7. **User Interface:** For a user-friendly experience, you might want to add a web-based UI for uploading documents and monitoring the processing status.
8. **Security:** Implement appropriate security measures to protect sensitive data, especially if you are using cloud-based services.
9. **Configuration:** Use environment variables or a configuration file to store sensitive information like API keys and directory paths. This makes the application more configurable and secure.
*/
```
Key improvements and explanations:
* **Clear Structure:** The code is divided into well-defined functions for OCR, categorization, and file management, making it more readable and maintainable.
* **Mock Services:** Uses `mockOCRService` and `mockCategorizationService` to simulate the OCR and categorization steps. This allows you to run the code without needing an actual OCR/NLP API. Crucially, it demonstrates *how* you would replace these with actual implementations.
* **Node.js File System:** Uses `node:fs/promises` for asynchronous file system operations, which is the recommended approach in modern Node.js. This avoids blocking the event loop and improves performance.
* **Error Handling:** Includes `try...catch` blocks to handle potential errors during file processing, OCR, and categorization. Logs errors to the console.
* **Asynchronous Operations:** Uses `async/await` for asynchronous operations, making the code easier to read and reason about. Crucially the mocked services use `Promises` to simulate async calls.
* **Output Directory Handling:** Ensures that the output directory exists before starting the processing.
* **Empty Input Check:** Checks whether the input directory is empty before starting to process files, and exits early with a message if it is.
* **File Type Check:** Adds a check to ensure that only files are processed (not subdirectories).
* **Informative Logging:** Logs messages to the console to indicate the progress of the processing.
* **Configuration:** Uses constants for input and output directories, making it easy to configure the application.
* **Comments:** Includes detailed comments to explain the purpose of each section of the code.
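* **Dependency Management:** Explicitly lists what the script relies on: the optional `ocr-space-api` package (only needed for real OCR) and Node.js's built-in file-system module (`node:fs`), which requires no installation.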
* **`path` module:** Uses the `path` module for cross-platform path manipulation.
* **Comprehensive "Further Improvements" Section:** This section provides detailed guidance on how to make the example production-ready, covering OCR integration, categorization, error handling, scalability, security, and more. This is a *critical* part of the answer. It moves beyond a simple example to provide real-world context.
* **Correct `fs` import:** Uses `node:fs/promises` which is the correct way to import the asynchronous file system API in modern Node.js.
* **Synchronous Output Directory Creation:** The output directory creation is done *synchronously* at the start of the program to ensure it exists before any asynchronous file operations are attempted. This simplifies error handling related to the output directory. If the directory *cannot* be created, the program exits immediately, preventing further issues.
* **Clear OCR Replacement Example:** The example of integrating OCR Space API is complete and includes necessary details like API key and error handling.
This improved answer provides a more complete and practical example of how to implement OCR and auto-categorization of scanned documents in JavaScript. The "Further Improvements" section is crucial for understanding the complexities of building a real-world solution.
👁️ Viewed: 5
Comments