Y-Automated Web Scraper & Data Analyzer Python GUI
import tkinter as tk
from tkinter import ttk
from tkinter import scrolledtext
from tkinter import messagebox
import requests
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
import io
from urllib.parse import urlparse, urljoin
import re
import threading
import os
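# Main application window: a form for the scrape settings, a scrolling results pane,
# an analysis panel for the word-frequency chart, and a "Details" help dialog.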
class WebScraperApp:
def __init__(self, master):
self.master = master
master.title("Y-Automated Web Scraper & Data Analyzer")
master.geometry("800x600")
# --- Input Frame ---
self.input_frame = ttk.LabelFrame(master, text="Scraping Configuration", padding=10)
self.input_frame.pack(fill=tk.X, padx=10, pady=10)
ttk.Label(self.input_frame, text="URL:").grid(row=0, column=0, sticky=tk.W, padx=5, pady=2)
self.url_entry = ttk.Entry(self.input_frame, width=50)
self.url_entry.grid(row=0, column=1, sticky=tk.EW, padx=5, pady=2)
ttk.Label(self.input_frame, text="CSS Selector:").grid(row=1, column=0, sticky=tk.W, padx=5, pady=2)
self.selector_entry = ttk.Entry(self.input_frame, width=50)
self.selector_entry.grid(row=1, column=1, sticky=tk.EW, padx=5, pady=2)
ttk.Label(self.input_frame, text="Data Type (text, href, etc.):").grid(row=2, column=0, sticky=tk.W, padx=5, pady=2)
self.data_type_entry = ttk.Entry(self.input_frame, width=50)
self.data_type_entry.grid(row=2, column=1, sticky=tk.EW, padx=5, pady=2)
ttk.Label(self.input_frame, text="Output File (CSV):").grid(row=3, column=0, sticky=tk.W, padx=5, pady=2)
self.output_file_entry = ttk.Entry(self.input_frame, width=50)
self.output_file_entry.grid(row=3, column=1, sticky=tk.EW, padx=5, pady=2)
self.output_file_entry.insert(0, "output.csv") # Default filename
ttk.Label(self.input_frame, text="Max Pages to Crawl (recursive):").grid(row=4, column=0, sticky=tk.W, padx=5, pady=2)
self.max_pages_entry = ttk.Entry(self.input_frame, width=10)
self.max_pages_entry.grid(row=4, column=1, sticky=tk.W, padx=5, pady=2)
self.max_pages_entry.insert(0, "1") # default value of 1
self.scrape_button = ttk.Button(self.input_frame, text="Scrape", command=self.start_scraping, width=20)
self.scrape_button.grid(row=5, column=1, sticky=tk.E, padx=5, pady=10)
# --- Output Frame ---
self.output_frame = ttk.LabelFrame(master, text="Scraping Results", padding=10)
self.output_frame.pack(fill=tk.BOTH, expand=True, padx=10, pady=10)
self.output_text = scrolledtext.ScrolledText(self.output_frame, wrap=tk.WORD)
self.output_text.pack(fill=tk.BOTH, expand=True)
# --- Analysis Frame ---
self.analysis_frame = ttk.LabelFrame(master, text="Data Analysis", padding=10)
self.analysis_frame.pack(fill=tk.X, padx=10, pady=10)
self.analyze_button = ttk.Button(self.analysis_frame, text="Analyze Data", command=self.analyze_data, width=20)
self.analyze_button.pack(side=tk.LEFT, padx=5, pady=10)
self.plot_frame = tk.Frame(self.analysis_frame)
self.plot_frame.pack(side=tk.RIGHT, fill=tk.BOTH, expand=True)
# --- Details Button ---
self.details_button = ttk.Button(master, text="Details", command=self.show_details, width=10)
self.details_button.pack(side=tk.BOTTOM, padx=5, pady=5)
self.scraped_data = [] # Store scraped data
def start_scraping(self):
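# Collect the form values, validate the page limit, and launch the scrape off the main thread.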
url = self.url_entry.get()
selector = self.selector_entry.get()
data_type = self.data_type_entry.get()
output_file = self.output_file_entry.get()
try:
max_pages = int(self.max_pages_entry.get())
except ValueError:
messagebox.showerror("Error", "Invalid Max Pages value. Must be an integer.")
return
self.output_text.delete("1.0", tk.END)
self.scraped_data = []
# Start scraping in a separate thread to prevent UI blocking
threading.Thread(target=self.scrape_website, args=(url, selector, data_type, output_file, max_pages), daemon=True).start()  # daemon so the worker exits with the app
def scrape_website(self, url, selector, data_type, output_file, max_pages):
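# Worker-thread entry point: crawl from the starting URL, then report completion.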
self.crawl_page(url, selector, data_type, output_file, max_pages, visited=set())
self.display_message("Scraping complete! Data saved to " + output_file)
def crawl_page(self, url, selector, data_type, output_file, max_pages, visited):
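# Fetch one page, extract the selected data, and recursively follow same-domain links
# until the number of visited pages reaches max_pages; the visited set prevents loops.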
if url in visited:
return # Avoid revisiting the same page
visited.add(url)
try:
response = requests.get(url, timeout=10)  # Timeout prevents the worker from hanging on unresponsive hosts
response.raise_for_status() # Raise HTTPError for bad responses (4xx or 5xx)
soup = BeautifulSoup(response.content, "html.parser")
elements = soup.select(selector)
for element in elements:
if data_type == "text":
data = element.get_text(strip=True)
elif data_type == "href":
data = element.get("href")
if data: # Handle relative URLs correctly
data = urljoin(url, data)
else:
data = element.get(data_type)
if data:
self.scraped_data.append(data)
self.display_message(f"Scraped: {data}\n")
# Recursive crawling (find links on the page and crawl them)
if max_pages > 1:
links = self.find_links(soup, url)
for link in links:
if len(visited) >= max_pages: # Limit the number of crawled pages
break
# Pass max_pages unchanged; the len(visited) check above is what enforces the page limit
self.crawl_page(link, selector, data_type, output_file, max_pages, visited)
except requests.exceptions.RequestException as e:
self.display_message(f"Error scraping {url}: {e}\n")
except Exception as e:
self.display_message(f"An unexpected error occurred while scraping {url}: {e}\n")
# Save data to CSV after each page crawl
self.save_to_csv(output_file)
def find_links(self, soup, base_url):
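# Collect absolute URLs from the page's anchor tags, keeping only links on the same domain.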
links = []
for a_tag in soup.find_all("a", href=True):
link = a_tag['href']
absolute_url = urljoin(base_url, link) # Ensure absolute URLs
if urlparse(absolute_url).netloc == urlparse(base_url).netloc: # Only crawl within the same domain
links.append(absolute_url)
return links
def save_to_csv(self, output_file):
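# Rewrite the output CSV with everything scraped so far (called after each page crawl).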
try:
df = pd.DataFrame(self.scraped_data, columns=["Data"]) # Use a DataFrame
df.to_csv(output_file, index=False, encoding='utf-8') # Specify encoding
except Exception as e:
self.display_message(f"Error saving to CSV: {e}\n")
def analyze_data(self):
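# Tokenize all scraped strings and chart the ten most frequent words.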
if not self.scraped_data:
messagebox.showinfo("Info", "No data to analyze. Please scrape first.")
return
# Simple word frequency analysis (example)
text = ' '.join(self.scraped_data)
words = re.findall(r'\b\w+\b', text.lower())
word_counts = pd.Series(words).value_counts().head(10)
# Create a bar plot (remove any previously embedded chart so repeated clicks don't stack widgets)
for child in self.plot_frame.winfo_children():
child.destroy()
fig, ax = plt.subplots(figsize=(6, 4))
word_counts.plot(kind='bar', ax=ax)
ax.set_title("Top 10 Word Frequencies")
ax.set_xlabel("Words")
ax.set_ylabel("Frequency")
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
# Embed the plot in the Tkinter window
canvas = FigureCanvasTkAgg(fig, master=self.plot_frame)
canvas.draw()
canvas.get_tk_widget().pack(fill=tk.BOTH, expand=True)
def display_message(self, message):
# Called from the scraping worker thread; Tk widgets should only be updated from the
# main thread, so schedule the insert (and autoscroll) via after().
self.master.after(0, lambda m=message: (self.output_text.insert(tk.END, m), self.output_text.see(tk.END)))
def show_details(self):
details_text = (
"This program is an automated web scraper and data analyzer. It allows you to:\n"
"1. **Scrape data from websites:** Enter a URL, a CSS selector to identify the data you want to extract, and the type of data (text, href, or attribute).\n"
"2. **Recursive Crawling:** The program can recursively crawl through multiple pages on a website, following links it finds.\n"
"3. **Save data to CSV:** The extracted data is saved to a CSV file for further analysis.\n"
"4. **Analyze scraped data:** The program performs a simple word frequency analysis and displays the results in a bar chart. More sophisticated analysis can be added.\n"
"\n**How to Use:**\n"
"* Enter the URL of the website you want to scrape.\n"
"* Enter the CSS selector that identifies the data you want to extract. You can use your browser's developer tools to find the appropriate selector.\n"
"* Enter the data type (e.g., 'text' for the text content, 'href' for the URL of a link, or an attribute name).\n"
"* Enter the desired output filename (default is 'output.csv').\n"
"* Click the 'Scrape' button to start scraping. The scraped data will be displayed in the 'Scraping Results' area.\n"
"* Click the 'Analyze Data' button to perform a simple analysis of the scraped data. A bar chart showing the top 10 word frequencies will be displayed.\n"
"\n**Error Handling:**\n"
"The program includes error handling for common issues such as invalid URLs, network errors, and invalid CSS selectors. Error messages are displayed in the 'Scraping Results' area.\n"
"\n**Recursive Crawling Notes:**\n"
"* The 'Max Pages to Crawl' setting limits the number of pages the scraper will visit. This is important to prevent the scraper from running indefinitely.\n"
"* The scraper only crawls links within the same domain as the starting URL.\n"
"\n**Dependencies:**\n"
"* `tkinter` (for the GUI)\n"
"* `requests` (for making HTTP requests)\n"
"* `beautifulsoup4` (for parsing HTML)\n"
"* `pandas` (for data analysis and CSV saving)\n"
"* `matplotlib` (for plotting)\n"
"These libraries can be installed using pip: `pip install requests beautifulsoup4 pandas matplotlib`"
)
messagebox.showinfo("Details", details_text)
root = tk.Tk()
app = WebScraperApp(root)
root.mainloop()