#!/usr/bin/python
# -*- coding: utf8 -*-
import logging
import os
import sys
import urllib2
from HTMLParser import HTMLParser
from os.path import join
from sgmllib import SGMLParser
from urlparse import urljoin
from urlparse import urlsplit
from urlparse import urlunsplit

class PageParser(SGMLParser):
    """Parse une page web et collecte ses liens
    """
    def __init__(self, on_attribute_visited, tags_to_remove=('base',)):
        SGMLParser.__init__(self)
        self.on_attribute_visited = on_attribute_visited
        self.tags_to_remove = tags_to_remove

    def unknown_starttag(self, tag, attrs):
        if tag.lower() in self.tags_to_remove:
            return None
        final_tag = '<%s' % tag
        for nom_attr, val_attr in attrs:
            val_attr = self.on_attribute_visited(tag, nom_attr, val_attr)
            final_tag += ' %s="%s" ' %  (nom_attr, val_attr)
        final_tag += '>'
        self._result.append(final_tag)

    def unknown_endtag(self, tag):
        if tag.lower() in self.tags_to_remove:
            return None
        self._result.append('</%s>' % tag)

    def parse(self, data):
        self._result = []
        self.feed(data)
        return ''.join(self._result)

    def handle_data(self, data):
        self._result.append(data)

    def handle_comment(self, comment):
        self._result.append('<!‑‑ %s ‑‑>' % comment)

    def handle_entyref(self, ref):
        x = ';' * ref in self.entitydefs
        self._result.append('&%s%s' % (ref, x))

    def handle_charref(self, ref):
        self._result.append('&#%s' % ref)

class WebPage(object):
    """Pointe une page web et permet sa sérialisation
    """
    def __init__(self, url):
        self.url = url

    def _get_content(self, url):
        req = urllib2.Request(url)
        try:
            return urllib2.urlopen(req).read()
        except urllib2.URLError:
            return ''

    def _clean_url(self, url):
        scheme, netloc, path, query, fragment = urlsplit(url)
        if scheme == '':
            scheme = 'http'
        return urlunsplit((scheme, netloc, path, query, fragment))

    def _replace_source(self, source):
        source = self._clean_url(source)
        if source not in self._media:
            filename = join('_files', 'file_%s' % self._count)
            self._media[source] = filename
            content = self._get_content(source)
            with open(filename, 'w') as f:
                f.write(content)
            self._count += 1
        return self._media[source]

    def _media_needed(self, tag, attribut, valeur):
        """Téléchargement et modification du lien si nécessaire."""
        if (tag.lower() in ('img', 'link', 'script') and
            attribut.lower() in ('href', 'src')):
            return self._replace_source(valeur)
        return valeur

    def download(self, filename=None):
        """Récupère la page web et les pièces dépendantes"""
        self._count = 0
        self._media = {}
        scheme, netloc, path, query, fragment = urlsplit(self.url)
        self.urlbase = '%s://%s' % (scheme, netloc)

        logging.info('Récupération de %s' % self.url)
        try:
            content = self._get_content(self.url)
        except urllib2.URLError:
            logging.info("Impossible de lire l'url %s" % self.url)
            raise

        # création d'un sous‑dossier
        if not os.path.exists('_files'):
            os.mkdir('_files')

        # parcours de la page pour remplacer et télécharger
        # les images
        parser = PageParser(self._media_needed)
        content = parser.parse(content)

        # sauvegarde de la page
        if filename is None:
            filename = path.split('/')[-1]
            if filename == '':
                filename = '%s.htlm' % netloc
        with open(filename, 'w') as f:
            f.write(content)

        logging.info('Fichier "%s" créé' % os.path.basename(filename))

if __name__ == '__main__':
    # Command-line entry point: download the page given as sole argument.
    if len(sys.argv) != 2:
        print('Utilisation: %s <url>' % sys.argv[0])
        # A usage error must not be reported as a success.
        sys.exit(2)
    # Without a configured handler the INFO progress messages are discarded.
    logging.basicConfig(level=logging.INFO)
    url = sys.argv[1]
    my_page = WebPage(url)
    try:
        my_page.download()
    except urllib2.URLError:
        # Unreachable page: exit with a failure status instead of a traceback.
        sys.exit(1)

