Add Python scraper using Claude AI

Author: zramsay
Date:   2024-11-05 08:41:12 -05:00
Commit: 907dc82938

scrape.py (new file, 187 lines)

@@ -0,0 +1,187 @@
import logging
import os

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse, urlunparse

class WebflowScraper:
    def __init__(self, base_url, output_dir="website"):
        self.base_url = base_url.rstrip('/')
        self.output_dir = output_dir
        self.visited_urls = set()
        self.session = requests.Session()
        self.encoding = 'utf-8'
        # Set up logging
        logging.basicConfig(level=logging.INFO, format='%(message)s')
        self.logger = logging.getLogger(__name__)

    def clean_url(self, url):
        """Remove query parameters and fragments from a URL."""
        parsed = urlparse(url)
        path = parsed.path
        if not path or path == '/':
            path = '/index.html'
        return urlunparse((parsed.scheme, parsed.netloc, path, '', '', ''))
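
    # For example, clean_url('https://example.com/pricing?utm=x#plans') returns
    # 'https://example.com/pricing', and the bare root 'https://example.com/'
    # becomes 'https://example.com/index.html'.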

    def is_same_domain(self, url):
        """Check if a URL belongs to the same domain as the base URL."""
        return urlparse(self.base_url).netloc == urlparse(url).netloc

    def get_local_path(self, url):
        """Convert a URL to a local file path under the output directory."""
        clean_url = self.clean_url(url)
        parsed = urlparse(clean_url)
        path = parsed.path.lstrip('/')
        # Handle root URL
        if not path:
            return os.path.join(self.output_dir, 'index.html')
        # Handle paths without extensions
        if '.' not in os.path.basename(path):
            if path.endswith('/'):
                path = os.path.join(path, 'index.html')
            else:
                path = f"{path}.html"
        return os.path.join(self.output_dir, path)
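
    # For example: '/' maps to website/index.html, '/blog/' to
    # website/blog/index.html, and an extensionless page like '/pricing'
    # to website/pricing.html.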

    def get_relative_path(self, from_path, to_path):
        """Get the relative path from one local file to another."""
        from_dir = os.path.dirname(from_path)
        return os.path.relpath(to_path, from_dir)
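
    # For example, the path from website/blog/post.html to website/css/site.css
    # is '../css/site.css', so rewritten links keep working no matter where the
    # output directory is served from.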

    def save_file(self, url, content, content_type):
        """Save content to the appropriate local file."""
        local_path = self.get_local_path(url)
        os.makedirs(os.path.dirname(local_path), exist_ok=True)
        if isinstance(content, bytes):
            with open(local_path, 'wb') as f:
                f.write(content)
        else:
            with open(local_path, 'w', encoding=self.encoding) as f:
                f.write(content)
        self.logger.info(f"Saved file: {local_path}")
        return local_path

    def download_asset(self, url):
        """Download and save an asset file."""
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            content_type = response.headers.get('content-type', '').split(';')[0]
            return self.save_file(url, response.content, content_type)
        except Exception as e:
            self.logger.error(f"Error downloading asset {url}: {e}")
            return None

    def process_html(self, url, html_content):
        """Process HTML content and download all referenced assets."""
        soup = BeautifulSoup(html_content, 'html.parser')
        current_file_path = self.get_local_path(url)
        # Remove any <base> tag so rewritten relative paths resolve correctly
        base_tag = soup.find('base')
        if base_tag:
            base_tag.decompose()
        # Tags and attributes that can reference downloadable assets
        asset_selectors = {
            'img': 'src',
            'link': 'href',
            'script': 'src',
            'video': 'src',
            'source': 'src',
            'audio': 'src',
            'iframe': 'src',
        }
        # Download same-domain assets and rewrite their references
        for tag, attr in asset_selectors.items():
            for element in soup.find_all(tag):
                if attr in element.attrs:
                    asset_url = element[attr]
                    if not asset_url or asset_url.startswith(('data:', 'blob:', 'javascript:', '#')):
                        continue
                    asset_url = urljoin(url, asset_url)
                    if self.is_same_domain(asset_url):
                        if asset_url not in self.visited_urls:
                            self.visited_urls.add(asset_url)
                            local_path = self.download_asset(asset_url)
                            if local_path:
                                relative_path = self.get_relative_path(current_file_path, local_path)
                                element[attr] = relative_path
                        else:
                            local_path = self.get_local_path(asset_url)
                            relative_path = self.get_relative_path(current_file_path, local_path)
                            element[attr] = relative_path
        # Crawl and rewrite internal links (check the raw href, since urljoin
        # would turn a bare '#fragment' into an absolute URL)
        for a in soup.find_all('a', href=True):
            raw_href = a['href']
            if raw_href.startswith(('mailto:', 'tel:', '#')):
                continue
            href = urljoin(url, raw_href)
            if self.is_same_domain(href):
                clean_href = self.clean_url(href)
                if clean_href not in self.visited_urls:
                    self.crawl(clean_href)
                local_path = self.get_local_path(clean_href)
                relative_path = self.get_relative_path(current_file_path, local_path)
                a['href'] = relative_path
        # Update form actions
        for form in soup.find_all('form', action=True):
            action_url = urljoin(url, form['action'])
            if self.is_same_domain(action_url):
                local_path = self.get_local_path(action_url)
                relative_path = self.get_relative_path(current_file_path, local_path)
                form['action'] = relative_path
        # Replace characters that cannot round-trip through UTF-8
        html_content = str(soup)
        html_content = html_content.encode('utf-8', 'replace').decode('utf-8')
        return html_content
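
    # Note: process_html calls crawl for newly discovered internal links, so the
    # two methods recurse through the site together; the visited_urls set is what
    # stops the traversal from revisiting pages (very deep sites could, in
    # principle, hit Python's recursion limit).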

    def crawl(self, url):
        """Crawl the website starting from the given URL."""
        if url in self.visited_urls:
            return
        self.visited_urls.add(url)
        self.logger.info(f"Crawling: {url}")
        try:
            response = self.session.get(url, timeout=30)
            response.raise_for_status()
            content_type = response.headers.get('content-type', '').split(';')[0]
            if 'text/html' in content_type:
                processed_html = self.process_html(url, response.text)
                self.save_file(url, processed_html, content_type)
            else:
                self.save_file(url, response.content, content_type)
        except Exception as e:
            self.logger.error(f"Error crawling {url}: {e}")

    def scrape(self):
        """Start the scraping process."""
        self.logger.info(f"Starting to scrape {self.base_url}")
        if os.path.exists(self.output_dir):
            self.logger.warning(f"Output directory {self.output_dir} already exists; files may be overwritten.")
        self.crawl(self.base_url)
        self.logger.info("Scraping completed")

# Usage
if __name__ == "__main__":
    scraper = WebflowScraper('https://laconic-staging.webflow.io')
    scraper.scrape()
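
# To preview the mirrored site locally afterwards (assuming Python 3.7+ for
# http.server's --directory flag):
#   python -m http.server --directory website 8000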