Automated Legal Document Analyzer and Contract Review System (Scala)
👤 Sharing: AI
```scala
object AutomatedLegalDocumentAnalyzer {
  import java.io.File
  import java.util.regex.Pattern
  import scala.io.Source
  import scala.util.control.NonFatal
  import scala.util.matching.Regex

  // --- Configurable Parameters ---

  /** Minimum keyword density (keyword occurrences / total words) above which a
    * section is flagged as a potential area of legal importance. */
  val KEYWORD_THRESHOLD = 0.05

  /** Regex patterns for sensitive data. Each pattern captures the human-readable
    * label in group 1 and the sensitive value itself in group 2; detection relies
    * on those capture groups (see detectSensitiveData). */
  val SENSITIVE_DATA_PATTERNS: List[Regex] = List(
    """\b(SSN|Social Security Number)\b[:\s]*(\d{3}-\d{2}-\d{4})\b""".r,
    """\b(Credit Card Number)\b[:\s]*(\d{4}[- ]?\d{4}[- ]?\d{4}[- ]?\d{4})\b""".r,
    """\b(Email Address)\b[:\s]*([\w\.-]+@[\w\.-]+\.\w+)\b""".r,
    """\b(Phone Number)\b[:\s]*(\d{3}[-.\s]?\d{3}[-.\s]?\d{4})\b""".r,
    """\b(Bank Account Number)\b[:\s]*(\d{8,17})\b""".r
  )

  /** Legal keywords and concepts to look for (expand this list as needed).
    * Multi-word entries such as "force majeure" or "governing law" are matched
    * as whole phrases against the cleaned text, not as individual tokens. */
  val LEGAL_KEYWORDS: Set[String] = Set(
    "agreement", "contract", "liability", "warranty", "indemnification",
    "confidentiality", "breach", "termination", "jurisdiction", "arbitration",
    "intellectual property", "copyright", "trademark", "patent", "force majeure",
    "governing law", "dispute resolution", "damages", "negligence", "performance",
    "obligation", "representation", "limitation", "remedy", "severability",
    "assignment", "amendment", "waiver", "entire agreement", "consideration",
    "offer", "acceptance", "default", "remedies", "specific performance", "injunction",
    "liquidated damages", "consequential damages", "punitive damages"
  )

  // --- Helper Functions ---

  /**
   * Reads a file and returns its content as a single String.
   *
   * The underlying Source is always closed. Fatal errors (OutOfMemoryError, etc.)
   * are allowed to propagate; only non-fatal failures degrade to an empty result.
   *
   * @param filePath The path to the file.
   * @return The content of the file, or an empty string if a non-fatal error occurs.
   */
  def readFile(filePath: String): String = {
    try {
      val source = Source.fromFile(filePath)
      try source.mkString finally source.close()
    } catch {
      case NonFatal(e) =>
        println(s"Error reading file: ${e.getMessage}")
        "" // Degrade gracefully instead of crashing the whole analysis
    }
  }

  /**
   * Splits text into sections based on common section headings ("Article 1:",
   * "Section 2 -", "Clause 3:", "Chapter 4:"). A more sophisticated approach
   * could use NLP techniques to identify sections based on context.
   *
   * @param text The text to split.
   * @return A list of trimmed, non-empty sections.
   */
  def splitIntoSections(text: String): List[String] = {
    val sectionDividers = List(
      "\\n\\s*Article\\s+\\d+\\s*[:\\-]\\s*".r, // e.g., "Article 1:" or "Article 1 - "
      "\\n\\s*Section\\s+\\d+\\s*[:\\-]\\s*".r, // e.g., "Section 2:" or "Section 2 -"
      "\\n\\s*Clause\\s+\\d+\\s*[:\\-]\\s*".r,  // e.g., "Clause 3:" or "Clause 3 -"
      "\\n\\s*Chapter\\s+\\d+\\s*[:\\-]\\s*".r  // e.g., "Chapter 4:" or "Chapter 4 -"
    )
    // Apply each divider in turn, splitting every section produced so far.
    val sections = sectionDividers.foldLeft(List(text)) { (acc, divider) =>
      acc.flatMap(section => divider.split(section).toList)
    }
    sections.map(_.trim).filter(_.nonEmpty) // Trim and remove empty sections
  }

  /**
   * Cleans text by removing punctuation and converting to lowercase. Further
   * cleaning could involve stemming or lemmatization.
   *
   * @param text The text to clean.
   * @return The cleaned text.
   */
  def cleanText(text: String): String =
    text.replaceAll("[^a-zA-Z0-9\\s]", "").toLowerCase

  /**
   * Counts whole-word (or whole-phrase) occurrences of `keyword` in text that
   * has already been passed through cleanText. Pattern.quote prevents keyword
   * characters from being interpreted as regex metacharacters.
   */
  private def countKeywordOccurrences(cleanedText: String, keyword: String): Int =
    ("""\b""" + Pattern.quote(keyword) + """\b""").r.findAllMatchIn(cleanedText).size

  /**
   * Calculates the frequency of legal keywords in a text. Unlike a simple
   * token-by-token comparison, this also counts multi-word keywords such as
   * "force majeure", which a whitespace split could never match.
   *
   * @param text The text to analyze.
   * @return A map from keyword to occurrence count (keywords with zero
   *         occurrences are omitted).
   */
  def analyzeKeywordFrequency(text: String): Map[String, Int] = {
    val cleanedText = cleanText(text)
    LEGAL_KEYWORDS.iterator
      .map(keyword => keyword -> countKeywordOccurrences(cleanedText, keyword))
      .filter { case (_, count) => count > 0 }
      .toMap
  }

  /**
   * Calculates the keyword density in a section: total keyword occurrences
   * (including multi-word phrases) divided by the total number of words.
   *
   * @param sectionText The text of the section.
   * @return The keyword density, or 0.0 for an empty/blank section.
   */
  def calculateKeywordDensity(sectionText: String): Double = {
    val cleanedText = cleanText(sectionText)
    // split("\\s+") on an empty string yields Array(""), so count only
    // non-empty tokens to make the zero-word guard actually fire.
    val totalWords = cleanedText.split("\\s+").count(_.nonEmpty).toDouble
    if (totalWords == 0) 0.0 // Avoid division by zero
    else {
      val keywordCount = LEGAL_KEYWORDS.iterator
        .map(keyword => countKeywordOccurrences(cleanedText, keyword))
        .sum
      keywordCount / totalWords
    }
  }

  /**
   * Detects sensitive data patterns in the text.
   *
   * @param text The text to analyze.
   * @return A list of tuples containing the human-readable label captured by the
   *         pattern (e.g. "SSN") and the matched sensitive value itself.
   */
  def detectSensitiveData(text: String): List[(String, String)] =
    SENSITIVE_DATA_PATTERNS.flatMap { pattern =>
      // Group 1 is the label, group 2 the sensitive value (see pattern definitions).
      pattern.findAllMatchIn(text).map(m => (m.group(1), m.group(2))).toList
    }

  // --- Main Analysis Function ---

  /**
   * Analyzes a legal document file: splits it into sections and reports keyword
   * frequency, keyword density, and any detected sensitive data per section.
   *
   * @param filePath The path to the legal document file.
   */
  def analyzeLegalDocument(filePath: String): Unit = {
    val content = readFile(filePath)
    if (content.isEmpty) {
      println("Document is empty or could not be read.")
    } else {
      val sections = splitIntoSections(content)
      println(s"Document analyzed: $filePath")
      println(s"Number of sections: ${sections.length}")
      println("---")
      sections.zipWithIndex.foreach { case (section, index) =>
        println(s"Section ${index + 1}:")
        val keywordFrequency = analyzeKeywordFrequency(section)
        println(s"  Keyword Frequency: ${keywordFrequency.mkString(", ")}")
        val keywordDensity = calculateKeywordDensity(section)
        println(s"  Keyword Density: ${keywordDensity}")
        if (keywordDensity > KEYWORD_THRESHOLD) {
          println("  Warning: High keyword density - potential area of legal importance.")
        }
        val sensitiveData = detectSensitiveData(section)
        if (sensitiveData.nonEmpty) {
          println("  Warning: Sensitive data detected:")
          sensitiveData.foreach { case (patternName, value) =>
            println(s"    - $patternName: $value")
          }
        }
        println("---")
      }
    }
  }

  // --- Main Method (Driver) ---

  /** Entry point: validates the single file-path argument and runs the analysis. */
  def main(args: Array[String]): Unit = {
    if (args.length != 1) {
      println("Usage: scala AutomatedLegalDocumentAnalyzer <file_path>")
    } else {
      val filePath = args(0)
      val file = new File(filePath)
      if (!file.exists() || !file.isFile) {
        println(s"Error: File '$filePath' does not exist or is not a valid file.")
      } else {
        analyzeLegalDocument(filePath)
      }
    }
  }
}
```
Key improvements and explanations:
* **Error Handling:** The `readFile` function now includes a `try-catch` block to handle potential file reading errors. Instead of throwing an exception (which would crash the program), it prints an error message and returns an empty string. This allows the analysis to continue even if one file cannot be read. The main method also checks if the file exists and is actually a file. This makes the program much more robust.
* **Clearer Section Splitting:** The `splitIntoSections` function is significantly improved. It now uses a list of regular expressions to split the text based on common section headings like "Article 1:", "Section 2 -", etc. It handles variations with colons and dashes and leading/trailing whitespace. It also trims the sections and removes empty ones. This addresses the previous weakness where the program couldn't effectively separate sections. It uses a `flatMap` to apply each section divider one after another.
* **Improved Text Cleaning:** The `cleanText` function removes all punctuation and converts the text to lowercase, which is crucial for accurate keyword matching.
* **Keyword Density Calculation:** Added `calculateKeywordDensity`. This calculates the ratio of legal keywords to the total number of words in a section. The `KEYWORD_THRESHOLD` constant allows you to configure the sensitivity of the warning for high keyword density. A check for `totalWords == 0` avoids division by zero errors.
* **Sensitive Data Detection:** The `detectSensitiveData` function now uses a list of regular expressions (`SENSITIVE_DATA_PATTERNS`) to identify patterns that might indicate sensitive information. The use of regular expressions makes this much more powerful and flexible. It returns a list of matches along with the *name* of the pattern that matched.
* **Configurable Parameters:** The `KEYWORD_THRESHOLD` and `SENSITIVE_DATA_PATTERNS` are defined as `val`s at the top of the script. This makes it easy to customize the behavior of the analyzer without modifying the core logic.
* **More Comprehensive Legal Keywords:** The `LEGAL_KEYWORDS` set has been expanded to include a wider range of terms commonly found in legal documents. This significantly improves the accuracy of the keyword analysis.
* **Main Method:** The `main` method now takes the file path as a command-line argument. It also includes basic argument validation to ensure that the program is called correctly. It performs basic file existence and type checking.
* **Comments and Explanations:** I've added more detailed comments to explain the purpose of each function and the logic behind the code.
* **Scala Style:** The code is written in a more idiomatic Scala style, using `val` for immutable variables, pattern matching, and higher-order functions like `foldLeft`, `map`, `flatMap`, and `filter`.
* **Section Numbering:** The sections are now numbered in the output, making it easier to refer to specific parts of the document.
* **Clearer Output:** The output is formatted to be more readable, with clear headings and labels.
* **Functional Approach:** The code takes a functional approach by minimizing side effects and using immutable data structures.
* **Regex Usage:** The sensitive data detection uses regular expressions for more powerful and flexible pattern matching. Crucially, the regex includes capturing groups that allow extracting the matched value.
* **Return Types:** Explicit return types have been added for better code clarity and maintainability.
How to Run:
1. **Save:** Save the code as `AutomatedLegalDocumentAnalyzer.scala`.
2. **Compile:** Open a terminal or command prompt and compile the code using the Scala compiler:
```bash
scalac AutomatedLegalDocumentAnalyzer.scala
```
3. **Run:** Run the compiled code, providing the path to your legal document as a command-line argument:
```bash
scala AutomatedLegalDocumentAnalyzer /path/to/your/legal_document.txt
```
Replace `/path/to/your/legal_document.txt` with the actual path to your text file.
Example Usage:
Create a sample legal document (e.g., `sample_contract.txt`) with some text like this:
```
Article 1: Confidentiality Agreement
This Confidentiality Agreement (the "Agreement") is made and entered into as of October 26, 2023, by and between Acme Corp, a Delaware corporation ("Acme") and Beta Inc., a California corporation ("Beta").
Section 2: Obligations of Confidentiality
Beta acknowledges that it will receive confidential information from Acme. Beta agrees to protect such confidential information and not to disclose it to any third party. This obligation survives termination of this agreement. My phone number is 555-123-4567 and my email address is test@example.com.
Section 3: Termination
This agreement may be terminated by either party upon written notice.
Section 4: Governing Law
This agreement shall be governed by the laws of the State of Delaware.
SSN: 123-45-6789. Also, a credit card number: 1234-5678-9012-3456.
```
Then, run the analyzer:
```bash
scala AutomatedLegalDocumentAnalyzer sample_contract.txt
```
The output will show the sections, keyword frequencies, and any detected sensitive data.
Further Improvements:
* **NLP Integration:** Integrate a Natural Language Processing (NLP) library (e.g., Stanford CoreNLP, spaCy) to perform more sophisticated analysis, such as:
* Named Entity Recognition (NER) to identify parties, dates, locations, etc.
* Sentiment analysis to gauge the tone of different sections.
* Dependency parsing to understand the relationships between words in a sentence.
* Topic modeling to identify the main themes of the document.
* **Machine Learning:** Train a machine learning model to classify different types of clauses (e.g., liability, indemnification, warranty). This would allow the system to automatically identify and flag important clauses.
* **GUI:** Create a graphical user interface (GUI) to make the system more user-friendly.
* **Database Integration:** Store the analysis results in a database for later retrieval and reporting.
* **PDF Parsing:** Use a library like Apache PDFBox or iText to parse PDF documents.
* **More Robust Section Splitting:** The current section splitting is basic. Use NLP or more advanced regular expressions for better accuracy.
* **Keyword Weighting:** Assign weights to different keywords based on their importance.
* **Thesaurus:** Use a thesaurus to expand the keyword search to include synonyms.
* **Rule-Based Analysis:** Implement a rule-based system to identify specific legal issues or risks based on the presence of certain keywords and phrases. For example, a rule could check for the presence of both "liability" and "limitation" within a certain proximity to each other.
This improved version provides a much more solid foundation for building a useful automated legal document analyzer. Remember to expand the keyword list and sensitive data patterns to suit your specific needs. Also, the section splitting is still relatively basic; more robust methods (potentially using NLP) would be beneficial in a real-world application.
👁️ Viewed: 4
Comments