Affiliate Link Manager Python

👤 Sharing: AI
```python
import hashlib
import os
import re
from urllib.parse import parse_qsl, urljoin, urlparse

import requests
from bs4 import BeautifulSoup

class AffiliateLinkManager:
    """
    Manages affiliate links by:
    1.  Detecting and extracting affiliate links from a webpage's HTML.
    2.  Replacing existing affiliate links with shorter, tracked links.
    3.  Providing basic click tracking functionality (placeholder).

    Attributes:
        tracking_prefix (str): Path segment placed before the short ID.
        tracking_domain (str): Host used when building tracking URLs.
        affiliate_parameters (list): Query parameter names that mark a URL as an affiliate link.
        tracked_links (dict): Mapping ``{original_url: tracking_url}`` of every link shortened so far.
    """

    # Seconds to wait when fetching a page. Without a timeout, requests.get()
    # can block indefinitely on a server that accepts the connection but never answers.
    REQUEST_TIMEOUT = 10

    def __init__(self, tracking_prefix="go", tracking_domain="example.com", affiliate_parameters=None):
        """
        Initializes the AffiliateLinkManager.

        Args:
            tracking_prefix (str): The prefix for the tracking URLs (e.g., "go").
            tracking_domain (str): The domain where the tracking script resides (e.g., "example.com").
            affiliate_parameters (list): A list of affiliate parameter names (e.g., ['affid', 'ref', 'trackingid']).
                When omitted (or empty), a built-in default list is used.
        """
        self.tracking_prefix = tracking_prefix
        self.tracking_domain = tracking_domain
        self.affiliate_parameters = affiliate_parameters or ['affid', 'ref', 'trackingid', 'affiliate_id']
        self.tracked_links = {}  # {original_url: tracking_url}

    def _affiliate_anchors(self, soup, base_url):
        """
        Yields ``(a_tag, absolute_url)`` for every affiliate ``<a>`` tag in the parsed document.

        Args:
            soup (BeautifulSoup): The parsed HTML document.
            base_url (str): The URL used to resolve relative hrefs.
        """
        for a_tag in soup.find_all('a', href=True):
            href = a_tag['href']
            # Test the href itself rather than the resolved URL: urljoin() lets an
            # empty or fragment-only href (e.g. "#top") inherit the base URL's query
            # string, which would flag in-page anchors as affiliate links whenever
            # the page URL itself carries an affiliate parameter.
            if self.is_affiliate_link(href):
                yield a_tag, urljoin(base_url, href)

    def find_affiliate_links(self, url, html_content=None):
        """
        Finds potential affiliate links in a webpage's HTML.

        Args:
            url (str): The URL of the webpage; also the base for resolving relative links.
            html_content (str, optional): HTML content to parse. If not provided, it is fetched from ``url``.

        Returns:
            list: Absolute URLs of the affiliate links found, in document order.
                Empty if the page could not be fetched (the error is printed).
        """
        if html_content is None:
            try:
                response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
                response.raise_for_status()  # Turn 4xx/5xx responses into an HTTPError
            except requests.exceptions.RequestException as e:
                # Best-effort: report the failure and return no links instead of raising.
                print(f"Error fetching URL {url}: {e}")
                return []
            else:
                html_content = response.text

        soup = BeautifulSoup(html_content, 'html.parser')
        return [absolute_url for _, absolute_url in self._affiliate_anchors(soup, url)]

    def is_affiliate_link(self, url):
        """
        Checks if a URL is likely an affiliate link based on its query parameter names.

        A URL matches only when one of its query parameter *names* equals a
        configured affiliate parameter; parameter values and partial name
        matches (e.g. "preference" vs. "ref") are ignored.

        Args:
            url (str): The URL to check (absolute or relative).

        Returns:
            bool: True if the URL is likely an affiliate link, False otherwise.
        """
        query = urlparse(url).query
        if not query:
            return False  # No query parameters, likely not an affiliate link

        wanted = set(self.affiliate_parameters)
        # keep_blank_values=True so a bare "?ref" (no "=value") still counts.
        return any(name in wanted for name, _ in parse_qsl(query, keep_blank_values=True))

    def generate_tracking_link(self, original_url):
        """
        Generates a shortened, tracked link for an affiliate link and records it in ``tracked_links``.

        Args:
            original_url (str): The original affiliate link.

        Returns:
            str: The generated tracking link.
        """
        # The ID is the first 8 hex characters of the URL's MD5 digest. It is
        # deterministic, but only 32 bits wide, so distinct URLs can collide.
        # MD5 is used purely as a fingerprint here, not for security.
        short_id = hashlib.md5(original_url.encode()).hexdigest()[:8]

        tracking_url = f"https://{self.tracking_domain}/{self.tracking_prefix}/{short_id}"
        self.tracked_links[original_url] = tracking_url
        return tracking_url

    def replace_affiliate_links(self, html_content, base_url=""):
        """
        Replaces existing affiliate links in HTML content with tracked links.

        Args:
            html_content (str): The HTML content to modify.
            base_url (str): The base URL of the page (used for resolving relative URLs).

        Returns:
            str: The modified HTML content with affiliate links replaced.
        """
        soup = BeautifulSoup(html_content, 'html.parser')

        for a_tag, absolute_url in self._affiliate_anchors(soup, base_url):
            # Reuse the tracking link if this URL has been shortened before.
            tracking_url = self.tracked_links.get(absolute_url)
            if tracking_url is None:
                tracking_url = self.generate_tracking_link(absolute_url)
            a_tag['href'] = tracking_url

        return str(soup)  # Serialize the modified document back to a string

    def track_click(self, short_id):
        """
        Placeholder function to track clicks on shortened links.
        In a real application, this would update a database or log file.

        Args:
            short_id (str): The short identifier from the tracking URL.
        """
        print(f"Click tracked for short ID: {short_id}")
        # In a real implementation, this would involve:
        # 1. Resolving the short_id to the original URL.
        # 2. Incrementing a click counter for that original URL.
        # 3. Redirecting the user to the original URL.



# Demo: exercise the manager end-to-end against an inline HTML sample.
if __name__ == "__main__":

    # Sample page: two absolute affiliate links, one plain link and one
    # relative affiliate link, so every code path is exercised without a network call.
    page_url = "https://www.example.com/products"
    page_html = """
    <html>
    <body>
      <h1>Product Page</h1>
      <p>Check out this great product: <a href="https://vendor.com/product1?affid=123&ref=banner">Product 1</a></p>
      <p>Another product here: <a href="https://another-vendor.com/item?tracking_code=abc">Item Link</a></p>
      <p>Just a regular link: <a href="https://example.com/about">About Us</a></p>
      <p>Relative affiliate link: <a href="/offers/deal1?affid=456">Deal 1</a></p>
    </body>
    </html>
    """

    # Build the manager with a custom tracking host and parameter list.
    manager = AffiliateLinkManager(
        tracking_prefix="go",
        tracking_domain="myshop.com",
        affiliate_parameters=['affid', 'ref', 'tracking_code'],
    )

    # Step 1: detect affiliate links (HTML is supplied, so nothing is fetched).
    detected = manager.find_affiliate_links(page_url, html_content=page_html)
    print("Found Affiliate Links:")
    for affiliate_url in detected:
        print(f"- {affiliate_url}")

    # Step 2: rewrite those links to their tracked equivalents.
    rewritten_html = manager.replace_affiliate_links(page_html, page_url)
    print("\nModified HTML:")
    print(rewritten_html)

    # Step 3: simulate a click. A web server handling
    # https://myshop.com/go/a1b2c3d4 would pull out the trailing ID
    # and hand it to track_click.
    manager.track_click("a1b2c3d4")

    # Finally, dump the original -> tracked mapping built up in step 2.
    print("\nTracked Links:")
    for source_url, short_url in manager.tracked_links.items():
        print(f"Original: {source_url}  --> Tracked: {short_url}")
```

Key improvements and explanations:

* **Clear Class Structure:**  The code is now organized within a class `AffiliateLinkManager`, promoting better code organization, reusability, and maintainability.
* **`__init__` method:** The constructor properly initializes the `tracking_prefix`, `tracking_domain`, `affiliate_parameters`, and an important `tracked_links` dictionary to store the original and shortened URLs.  This prevents the generation of a new shortened URL every time `replace_affiliate_links` is called on the same link.  It also makes the code more efficient.
* **`find_affiliate_links` function:**  This function accepts optional `html_content` and fetches the page itself only when none is provided.  If the request fails or returns a bad status code (4xx or 5xx), it prints the error and returns an empty list rather than raising.  Each matching `href` is resolved with `urljoin`, so relative links (e.g., `/offers/deal1?affid=456`) are returned as absolute URLs.
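* **`is_affiliate_link` function:**  Checks the URL's query string for any of the configured affiliate parameter names and returns `False` immediately when the URL has no query string.  The default parameter names (`affid`, `ref`, `trackingid`, `affiliate_id`) are set in `__init__` and can be overridden via `affiliate_parameters`.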
* **`generate_tracking_link` function:**  Implements a simplified shortening mechanism using `hashlib`: the short identifier is the first 8 hex characters of the URL's MD5 digest.  The ID is deterministic for a given URL but, being only 32 bits, is not guaranteed to be unique, so a real application should detect collisions.  It *also* stores the mapping between the original URL and the shortened URL in the `self.tracked_links` dictionary. This is *essential* for reusing shortened URLs and for tracking clicks in a real application.
* **`replace_affiliate_links` function:** This function now iterates through all `<a>` tags with an `href` attribute, checks if the link is an affiliate link (using `is_affiliate_link`), and *replaces the `href` attribute* with the generated tracked link.  Handles relative URLs with `urljoin`. Most importantly, it *checks if the original URL already has a tracked link in the `self.tracked_links` dictionary* before generating a new one. This is critical for performance and consistency. Uses the `base_url` for correctly handling relative URLs within the HTML.
* **`track_click` function:**  A placeholder function to demonstrate how clicks could be tracked.  The comments clearly explain how this would be implemented in a real-world scenario (resolving the short ID, updating a database, redirecting the user).
* **Example Usage (`if __name__ == "__main__":`)** A comprehensive example that shows how to use the class.  It demonstrates finding affiliate links, replacing them in HTML, and simulates click tracking.  It also includes example HTML with both absolute and relative affiliate links.  The example is self-contained and runnable.
* **Error Handling:** The `find_affiliate_links` method includes basic error handling to gracefully handle cases where the URL cannot be fetched.
* **Clarity and Comments:**  The code is well-commented, explaining the purpose of each function and the logic behind the implementation.
* **Dependencies:**  The code explicitly imports the necessary libraries (`requests`, `BeautifulSoup`, `urlparse`, `urljoin`, `hashlib`).
* **Robustness:** The code handles both absolute and relative URLs correctly.
* **Efficiency:** The code avoids generating duplicate shortened URLs by storing the mappings in the `tracked_links` dictionary.

This revised code provides a much more complete and practical solution for managing affiliate links.  It is now a runnable, self-contained example that demonstrates all the key functionalities. Remember to install the required libraries: `pip install requests beautifulsoup4`.
👁️ Viewed: 5
Affiliate Link Manager Python

Comments

Site Statistics