"""
Web Crawler Module
Extracts pages from websites for analysis
"""

import logging
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse


class Crawler:
    """Web crawler that collects the pages of a single website.

    URLs are processed in FIFO order starting from the seed URL. Every URL
    whose content type is HTML or PDF is recorded in ``pages``; links found
    in the downloaded body are queued when they belong to the seed URL's
    domain (or to a ``website-files.com`` host).

    Attributes:
        visited_urls: URLs that have been processed, successfully or not.
        urls_to_visit: Queue of URLs still waiting to be processed.
        company: Label supplied by the caller; stored but not used by the
            crawler itself.
        domain: Network location (host[:port]) of the seed URL.
        pages: URLs that were crawled successfully.
    """

    # Seconds to wait on the server before giving up on a request. Without a
    # timeout a single unresponsive host would stall the crawl forever.
    REQUEST_TIMEOUT = 10

    # Content-type prefixes that are downloaded and recorded as pages.
    CRAWLABLE_CONTENT_TYPES = ('text/html', 'application/pdf')

    def __init__(self, company='', url=''):
        """Create a crawler seeded with ``url``.

        Args:
            company: Free-form label kept on the instance.
            url: Seed URL. When empty, the queue starts empty so ``run()``
                does not attempt a request for an empty URL.
        """
        self.visited_urls = []
        self.urls_to_visit = [url] if url else []
        self.company = company
        self.domain = urlparse(url).netloc
        self.pages = []

    def download_url(self, url, content_type):
        """Fetch ``url`` and return its body.

        Args:
            url: Address to download.
            content_type: Content type previously reported for ``url``.

        Returns:
            The decoded text for ``text/html`` content, the raw bytes for
            any other content type, or ``None`` when ``content_type`` is
            ``None`` (no request is made in that case).
        """
        if content_type is None:
            return None
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        if content_type.startswith('text/html'):
            return response.text
        return response.content

    def get_linked_urls(self, url, html):
        """Yield the ``href`` of every ``<a>`` element found in ``html``.

        Hrefs starting with ``/`` are resolved against ``url``; every other
        value is yielded unchanged, including ``None`` for anchors that have
        no ``href`` attribute.

        Args:
            url: Address of the page the markup came from.
            html: Page body; it is converted with ``str()`` before parsing.
        """
        soup = BeautifulSoup(str(html), 'html.parser')
        for link in soup.find_all('a'):
            path = link.get('href')
            if path and path.startswith('/'):
                path = urljoin(url, path)
            yield path

    def add_url_to_visit(self, url):
        """Queue ``url`` unless it was already visited or is already queued."""
        if url not in self.visited_urls and url not in self.urls_to_visit:
            self.urls_to_visit.append(url)

    @classmethod
    def _is_crawlable(cls, content_type):
        """Return True when ``content_type`` is one the crawler downloads.

        A missing header (``None``) is treated as not crawlable.
        """
        return (
            content_type is not None
            and content_type.startswith(cls.CRAWLABLE_CONTENT_TYPES)
        )

    def _should_follow(self, url):
        """Return True when a discovered link should be added to the queue."""
        # Anchors without an href yield None; check before parsing the URL.
        if not url:
            return False
        url_domain = urlparse(url).netloc
        # Only follow URLs in the site domain or on website-files.com hosts.
        in_scope = (
            url_domain == self.domain or 'website-files.com' in url_domain
        )
        return in_scope and 'mailto' not in url

    def crawl(self, url):
        """Process one URL: record it as a page and queue its in-scope links.

        A HEAD request determines the content type first. URLs that are
        neither HTML nor PDF, or that report no content type at all, are
        logged and skipped.

        Args:
            url: Address to crawl.
        """
        response = requests.head(url, timeout=self.REQUEST_TIMEOUT)
        content_type = response.headers.get('content-type')
        if not self._is_crawlable(content_type):
            logging.info('Not crawling %s due to content type', url)
            return

        logging.info('Crawling: %s', url)
        body = self.download_url(url, content_type)

        self.pages.append(url)

        for linked_url in self.get_linked_urls(url, body):
            if self._should_follow(linked_url):
                self.add_url_to_visit(linked_url)

    def get_pages(self):
        """Return the list of URLs crawled successfully so far."""
        return self.pages

    def run(self):
        """Crawl queued URLs until the queue is empty.

        A failure on one URL is logged with its traceback and does not stop
        the crawl; the URL is marked as visited either way so it is not
        queued again.
        """
        while self.urls_to_visit:
            url = self.urls_to_visit.pop(0)
            try:
                self.crawl(url)
            except Exception:
                logging.exception('Failed to crawl: %s', url)
            finally:
                self.visited_urls.append(url)
