Adaptive Data Harmonizer Python GUI

👤 Sharing: AI
import tkinter as tk
from tkinter import ttk
from tkinter import filedialog, messagebox
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import seaborn as sns
import re

class AdaptiveDataHarmonizer:
    """Tkinter window for loading, cleaning, plotting and exporting tabular data.

    State kept on the instance:
        data          -- the DataFrame most recently loaded, or None.
        cleaned_data  -- result of the last preprocessing run on ``data``,
                         or None if the current data has not been cleaned.
        file_path     -- path of the file ``data`` was read from, or None.
    """

    # Matches every character that is not an ASCII letter, digit or whitespace;
    # such characters are stripped from text cells during preprocessing.
    _TEXT_JUNK_RE = re.compile(r'[^a-zA-Z0-9\s]')

    def __init__(self, master):
        """Build all widgets inside *master* (the Tk root or a Toplevel)."""
        self.master = master
        master.title("Adaptive Data Harmonizer")
        master.geometry("800x600")

        self.data = None
        # Defined up front (instead of appearing lazily in preprocess_data) so
        # that "no cleaned data yet" is an explicit, resettable state.
        self.cleaned_data = None
        self.file_path = None

        # Style
        self.style = ttk.Style()
        self.style.configure('TButton', padding=5, relief="raised")
        self.style.configure('TLabel', padding=5)
        self.style.configure('TEntry', padding=5)

        # UI Elements
        self.load_button = ttk.Button(master, text="Load Data", command=self.load_data)
        self.load_button.pack(pady=10)

        self.data_info_label = ttk.Label(master, text="No data loaded")
        self.data_info_label.pack()

        # The three data-dependent actions stay disabled until a file is loaded.
        self.preprocess_button = ttk.Button(
            master, text="Preprocess Data", command=self.preprocess_data, state=tk.DISABLED
        )
        self.preprocess_button.pack(pady=10)

        self.visualize_button = ttk.Button(
            master, text="Visualize Data", command=self.visualize_data, state=tk.DISABLED
        )
        self.visualize_button.pack(pady=10)

        self.export_button = ttk.Button(
            master, text="Export Cleaned Data", command=self.export_data, state=tk.DISABLED
        )
        self.export_button.pack(pady=10)

        self.details_button = ttk.Button(master, text="Details", command=self.show_details)
        self.details_button.pack(pady=10)

        self.status_label = ttk.Label(master, text="Ready")
        self.status_label.pack(side=tk.BOTTOM, fill=tk.X)

        self.notebook = ttk.Notebook(master)
        self.notebook.pack(fill=tk.BOTH, expand=True)

        self.raw_data_tab = ttk.Frame(self.notebook)
        self.cleaned_data_tab = ttk.Frame(self.notebook)

        self.notebook.add(self.raw_data_tab, text="Raw Data")
        self.notebook.add(self.cleaned_data_tab, text="Cleaned Data")

        self.raw_data_tree = ttk.Treeview(self.raw_data_tab, show="headings")
        self.raw_data_tree.pack(fill=tk.BOTH, expand=True)

        self.cleaned_data_tree = ttk.Treeview(self.cleaned_data_tab, show="headings")
        self.cleaned_data_tree.pack(fill=tk.BOTH, expand=True)

    def load_data(self):
        """Ask for a CSV/Excel file, read it into ``self.data`` and show it.

        Any cleaned data from a previously loaded file is discarded so that a
        later export cannot write results belonging to a different file.
        Failures are reported in a message box and in the status bar.
        """
        # Multiple patterns for one file type are passed as a tuple; a single
        # ';'-joined string is not split by Tk on every platform.
        path = filedialog.askopenfilename(
            filetypes=[("CSV files", "*.csv"), ("Excel files", ("*.xlsx", "*.xls"))]
        )
        if not path:
            return  # dialog cancelled: keep the current state untouched

        try:
            # Compare case-insensitively so "DATA.CSV" is not sent to read_excel.
            if path.lower().endswith('.csv'):
                frame = pd.read_csv(path)
            else:
                frame = pd.read_excel(path)

            self.file_path = path
            self.data = frame

            # Results of an earlier preprocessing run refer to the old file.
            self.cleaned_data = None
            self._clear_treeview(self.cleaned_data_tree)

            self.status_label.config(text=f"Loaded: {self.file_path}")
            # Column labels may be ints (or other non-strings); str.join needs str.
            column_names = ', '.join(map(str, frame.columns))
            self.data_info_label.config(text=f"Shape: {frame.shape}\nColumns: {column_names}")

            self.populate_treeview(self.raw_data_tree, frame)

            self.preprocess_button.config(state=tk.NORMAL)
            self.visualize_button.config(state=tk.NORMAL)
            self.export_button.config(state=tk.NORMAL)

        except Exception as e:  # GUI boundary: report instead of crashing the app
            messagebox.showerror("Error", f"Failed to load data: {e}")
            self.status_label.config(text=f"Error loading data: {e}")

    def preprocess_data(self):
        """Clean ``self.data`` into ``self.cleaned_data`` and display the result.

        Does nothing when no data is loaded. Failures are reported in a
        message box and in the status bar.
        """
        if self.data is None:
            return

        try:
            self.cleaned_data = self._clean_dataframe(self.data)
            self.populate_treeview(self.cleaned_data_tree, self.cleaned_data)
            self.status_label.config(text="Data Preprocessed")

        except Exception as e:  # GUI boundary: report instead of crashing the app
            messagebox.showerror("Error", f"Failed to preprocess data: {e}")
            self.status_label.config(text=f"Error preprocessing data: {e}")

    @classmethod
    def _clean_dataframe(cls, frame):
        """Return a cleaned copy of *frame*; *frame* itself is not modified.

        Steps, in order:
          1. Fill missing values: column mean for numeric columns, most
             frequent value otherwise. A column with no non-missing value has
             no mode and is left as it is.
          2. Drop duplicate rows.
          3. Lower-case text cells and strip every character that is not an
             ASCII letter, digit or whitespace. Non-string cells in a text
             column are kept unchanged.
          4. For each numeric column in turn, drop rows lying outside
             [Q1 - 1.5*IQR, Q3 + 1.5*IQR] of the rows still remaining.
        """
        cleaned = frame.copy()

        # 1. Handle missing values
        for col in cleaned.columns:
            series = cleaned[col]
            if not series.isnull().any():
                continue
            if pd.api.types.is_numeric_dtype(series):
                cleaned[col] = series.fillna(series.mean())
            else:
                modes = series.mode()
                # mode() is empty when every value is missing; indexing [0]
                # would raise and abort the whole preprocessing run.
                if not modes.empty:
                    cleaned[col] = series.fillna(modes.iloc[0])

        # 2. Remove duplicate rows
        cleaned = cleaned.drop_duplicates()

        # 3. Standardize text data. Columns are picked by dtype check so that
        # both object columns and pandas' dedicated string dtypes are covered.
        for col in cleaned.columns:
            series = cleaned[col]
            if pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series):
                cleaned[col] = series.map(cls._standardize_cell)

        # 4. Outlier removal using the IQR method. Bounds are recomputed per
        # column on the already-filtered rows, so the filters compound.
        for col in cleaned.select_dtypes(include=np.number).columns:
            q1 = cleaned[col].quantile(0.25)
            q3 = cleaned[col].quantile(0.75)
            iqr = q3 - q1
            lower_bound = q1 - 1.5 * iqr
            upper_bound = q3 + 1.5 * iqr
            cleaned = cleaned[(cleaned[col] >= lower_bound) & (cleaned[col] <= upper_bound)]

        return cleaned

    @classmethod
    def _standardize_cell(cls, value):
        """Lower-case and strip special characters from *value* if it is a str."""
        if isinstance(value, str):
            return cls._TEXT_JUNK_RE.sub('', value.lower())
        # Numbers, None/NaN etc. inside a text column are passed through as-is
        # rather than being turned into missing values.
        return value

    def visualize_data(self):
        """Show a seaborn pairplot of the numeric columns of ``self.data``.

        The plot opens in its own Toplevel window. Does nothing when no data
        is loaded; warns when the data has no numeric columns.
        """
        if self.data is None:
            return

        try:
            numeric = self.data.select_dtypes(include=np.number)
            if numeric.shape[1] == 0:
                messagebox.showwarning(
                    "No numerical data", "The loaded data has no numerical columns to plot."
                )
                self.status_label.config(text="No numerical columns to visualize")
                return

            # pairplot creates its own figure and makes it the current one, so
            # no separate plt.figure() call is needed (it would only leak an
            # empty, unused figure).
            sns.pairplot(numeric)
            fig = plt.gcf()
            fig.suptitle('Pairplot of Numerical Features', y=1.02)
            fig.tight_layout()

            # Embed plot in Tkinter window
            popup = tk.Toplevel(self.master)
            popup.title("Data Visualization")
            canvas = FigureCanvasTkAgg(fig, master=popup)
            canvas.draw()
            canvas.get_tk_widget().pack(side=tk.TOP, fill=tk.BOTH, expand=1)

            def close_popup():
                # pyplot keeps every figure alive until it is closed
                # explicitly; release it together with its window.
                plt.close(fig)
                popup.destroy()

            popup.protocol("WM_DELETE_WINDOW", close_popup)

            self.status_label.config(text="Data Visualized")

        except Exception as e:  # GUI boundary: report instead of crashing the app
            messagebox.showerror("Error", f"Failed to visualize data: {e}")
            self.status_label.config(text=f"Error visualizing data: {e}")

    def export_data(self):
        """Write the cleaned data (or the raw data if not yet cleaned) to CSV.

        Does nothing when no data is loaded or the save dialog is cancelled.
        Failures are reported in a message box and in the status bar.
        """
        if self.data is None:
            return

        try:
            file_path = filedialog.asksaveasfilename(
                defaultextension=".csv", filetypes=[("CSV file", "*.csv")]
            )
            if not file_path:
                return

            # cleaned_data is reset on every load, so it always belongs to
            # the file currently shown.
            frame = self.cleaned_data if self.cleaned_data is not None else self.data
            frame.to_csv(file_path, index=False)

            self.status_label.config(text=f"Data exported to {file_path}")

        except Exception as e:  # GUI boundary: report instead of crashing the app
            messagebox.showerror("Error", f"Failed to export data: {e}")
            self.status_label.config(text=f"Error exporting data: {e}")

    def populate_treeview(self, tree, data):
        """Replace the contents of *tree* with the rows and columns of *data*.

        Args:
            tree: the ttk.Treeview to fill.
            data: the DataFrame whose columns become headings and whose rows
                become items.
        """
        tree.delete(*tree.get_children())

        tree["columns"] = [str(name) for name in data.columns]
        tree["show"] = "headings"

        for column in tree["columns"]:
            tree.heading(column, text=column)
            tree.column(column, width=100)

        # itertuples keeps each column's own dtype; iterrows builds one Series
        # per row, which upcasts e.g. ints to floats in mixed numeric rows.
        for values in data.itertuples(index=False, name=None):
            tree.insert("", "end", values=values)

    @staticmethod
    def _clear_treeview(tree):
        """Remove all rows and column headings from *tree*."""
        tree.delete(*tree.get_children())
        tree["columns"] = ()

    def show_details(self):
        """Open a window with a read-only description of the program."""
        details_window = tk.Toplevel(self.master)
        details_window.title("Program Details")

        details_text = tk.Text(details_window, wrap=tk.WORD)
        details_text.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)

        paragraphs = (
            "Adaptive Data Harmonizer: A versatile tool designed to simplify and streamline the data cleaning and preparation process. It allows users to load data from CSV and Excel files, perform preprocessing steps (handling missing values, removing duplicates, standardizing text, and outlier removal), visualize data, and export the cleaned data.",
            "\n\nKey Features:\n",
            "- Data Loading: Supports CSV and Excel files.\n",
            "- Data Preprocessing: Includes handling of missing values, duplicate removal, text standardization, and outlier removal.\n",
            "- Data Visualization: Provides basic data visualization capabilities (e.g., pairplots) to understand data distribution.\n",
            "- Data Export: Exports cleaned data to CSV files.\n",
            "- User-Friendly Interface: Simple and intuitive GUI built with Tkinter.\n",
        )
        for paragraph in paragraphs:
            details_text.insert(tk.END, paragraph)
        details_text.config(state=tk.DISABLED)  # Make it read-only


if __name__ == "__main__":
    # Launch the GUI only when this file is run as a script, so that importing
    # the module (e.g. to reuse or test AdaptiveDataHarmonizer) does not open
    # a window and block in the Tk event loop.
    root = tk.Tk()
    app = AdaptiveDataHarmonizer(root)
    root.mainloop()
👁️ Viewed: 8
Adaptive Data Harmonizer Python GUI

Comments

Site Statistics