#!/usr/bin/python3
"""
PlanetFilter - filter for blog aggregators.

PlanetFilter uses a blacklist to filter a blog aggregator feed.
It allows anyone to subscribe to popular blog aggregators without
being overwhelmed by the noise.

Copyright (C) 2010, 2015  Francois Marier <francois@fmarier.org>

This program is free software: you can redistribute it and/or modify
it under the terms of the GNU Affero General Public License as
published by the Free Software Foundation, either version 3 of the
License, or (at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import argparse
import codecs
import configparser as cp
import defusedxml.minidom as minidom
import gzip
import html
import http.client
import io
import os
import os.path
import sys
import urllib.error
from urllib.parse import quote, urlsplit, urlunsplit
from urllib.request import Request, urlopen
from xml.dom.minidom import Node
import xml.parsers.expat

RDFNS = 'http://www.w3.org/1999/02/22-rdf-syntax-ns#'

VERSION = '0.7.0'


def delete_node(node):
    """Detach ``node`` from its parent node."""
    node.parentNode.removeChild(node)


def delete_rss1_item(item):
    """Remove an RSS 1.0 <item> together with the reference pointing at it.

    The <channel> sibling of the item may list it as an ``rdf:li`` element
    (inside an ``rdf:Seq``) whose ``rdf:resource`` equals the item's
    ``rdf:about``.  Every such reference is removed before the item itself.
    Feeds lacking a <channel> or an ``rdf:Seq`` simply have no reference to
    clean up; the item is still removed.

    Args:
        item: the <item> DOM element to remove; it must still be attached
            to its parent.
    """
    rdfabout = item.getAttributeNS(RDFNS, 'about')
    rdfnode = item.parentNode

    # Delete the reference to the item, when there is one
    channel = rdfnode.getElementsByTagName('channel').item(0)
    rdfseq = None
    if channel is not None:
        rdfseq = channel.getElementsByTagNameNS(RDFNS, 'Seq').item(0)
    if rdfseq is not None:
        # pylint: disable=invalid-name
        for li in rdfseq.getElementsByTagNameNS(RDFNS, 'li'):
            if li.getAttributeNS(RDFNS, 'resource') == rdfabout:
                delete_node(li)

    # Delete the item
    delete_node(item)


def is_rss2(xmldocument):
    """Tell whether the document has exactly one <rss> element at version 2.0."""
    candidates = xmldocument.getElementsByTagName('rss')
    if candidates.length == 1:
        # Check the version
        return candidates.item(0).getAttribute('version') == '2.0'
    return False


def is_rss1(xmldocument):
    """Tell whether the document is an RSS 1.0 (RDF) feed.

    That is the case when it holds exactly one ``rdf:RDF`` element whose
    ``xmlns`` attribute mentions the RSS 1.0 namespace.
    """
    roots = xmldocument.getElementsByTagNameNS(RDFNS, 'RDF')
    if roots.length == 1:
        # Check the namespace/version
        return 'purl.org/rss/1.0' in roots.item(0).getAttribute('xmlns')
    return False


def is_atom(xmldocument):
    """Tell whether the document is an Atom feed.

    That is the case when it holds exactly one <feed> element whose
    ``xmlns`` attribute mentions the Atom 2005 namespace.
    """
    feeds = xmldocument.getElementsByTagName('feed')
    if feeds.length == 1:
        # Check the namespace/version
        return 'w3.org/2005/Atom' in feeds.item(0).getAttribute('xmlns')
    return False


def filter_rss2(xmldocument, blacklist):
    """Remove blacklisted <item> elements from an RSS 2.0 document, in place.

    An item is removed when the text of one of its <title> elements starts
    with a blacklisted author or contains a blacklisted title, or when the
    text of one of its <link> elements starts with a blacklisted URL.
    Only the first child of each element is examined, and only when it is
    a plain text node; empty elements (e.g. ``<link/>``) are skipped.

    Args:
        xmldocument: DOM document whose <rss> element is to be filtered.
        blacklist: dict with 'authors', 'titles' and 'urls' keys, each
            mapping to a list of strings or to None.

    Returns:
        Always True.
    """
    authors = blacklist['authors'] or ()
    titles = blacklist['titles'] or ()
    urls = blacklist['urls'] or ()

    def first_text(element):
        """Return the stripped leading text of ``element``, or None."""
        textnode = element.firstChild
        if textnode is None or textnode.nodeType != Node.TEXT_NODE:
            return None
        return textnode.nodeValue.strip()

    def is_blacklisted(item):
        """Tell whether ``item`` matches any of the blacklists."""
        if authors or titles:
            for title in item.getElementsByTagName('title'):
                text = first_text(title)
                if text is None:
                    continue  # skip empty titles
                # Authors are matched as a prefix, titles anywhere
                if any(text.startswith(author) for author in authors):
                    return True
                if any(banned in text for banned in titles):
                    return True
        if urls:
            for link in item.getElementsByTagName('link'):
                text = first_text(link)
                if text is None:
                    continue  # skip empty links instead of crashing
                if any(text.startswith(url) for url in urls):
                    return True
        return False

    rss = xmldocument.getElementsByTagName('rss').item(0)
    channel = rss.getElementsByTagName('channel').item(0)
    if channel is None:
        return True  # no channel, hence nothing to filter

    for item in channel.getElementsByTagName('item'):
        if is_blacklisted(item):
            delete_node(item)

    return True


def filter_atom(xmldocument, blacklist):
    """Remove blacklisted <entry> elements from an Atom document, in place.

    An entry is removed when the name of one of its authors starts with a
    blacklisted author, when the text of one of its <title> elements starts
    with a blacklisted title, or when the ``href`` of one of its
    "alternate" links starts with a blacklisted URL.  For names and titles
    only the first child is examined, and only when it is a plain text
    node; empty or missing elements are skipped.

    Args:
        xmldocument: DOM document whose <feed> element is to be filtered.
        blacklist: dict with 'authors', 'titles' and 'urls' keys, each
            mapping to a list of strings or to None.

    Returns:
        Always True.
    """
    authors = blacklist['authors'] or ()
    titles = blacklist['titles'] or ()
    urls = blacklist['urls'] or ()

    def first_text(element):
        """Return the stripped leading text of ``element``, or None."""
        textnode = element.firstChild
        if textnode is None or textnode.nodeType != Node.TEXT_NODE:
            return None
        return textnode.nodeValue.strip()

    def is_blacklisted(entry):
        """Tell whether ``entry`` matches any of the blacklists."""
        if authors:
            for person in entry.getElementsByTagName('author'):
                name = person.getElementsByTagName('name').item(0)
                if name is None:
                    continue  # <author> without a <name>
                text = first_text(name)
                if text is None:
                    continue
                if any(text.startswith(author) for author in authors):
                    return True

        if titles:
            for title in entry.getElementsByTagName('title'):
                text = first_text(title)
                if text is None:
                    continue  # skip empty titles
                # NOTE(review): titles are prefix-matched here whereas the
                # RSS filters look for them anywhere in the title; confirm
                # which one is intended.
                if any(text.startswith(banned) for banned in titles):
                    return True

        if urls:
            for link in entry.getElementsByTagName('link'):
                # A link without a rel attribute is an "alternate" link
                # (RFC 4287, section 4.2.7.2)
                if (link.getAttribute('rel') or 'alternate') != 'alternate':
                    continue
                href = link.getAttribute('href')
                if any(href.startswith(url) for url in urls):
                    return True

        return False

    feed = xmldocument.getElementsByTagName('feed').item(0)
    for entry in feed.getElementsByTagName('entry'):
        if is_blacklisted(entry):
            delete_node(entry)

    return True


def filter_rss1(xmldocument, blacklist):
    """Remove blacklisted <item> elements from an RSS 1.0 document, in place.

    An item is removed (through delete_rss1_item()) when the text of one of
    its <title> elements starts with a blacklisted author or contains a
    blacklisted title, or when the text of one of its <link> elements
    starts with a blacklisted URL.  Only the first child of each element is
    examined, and only when it is a plain text node; empty elements (e.g.
    ``<link/>``) are skipped.

    Args:
        xmldocument: DOM document whose ``rdf:RDF`` element is to be
            filtered.
        blacklist: dict with 'authors', 'titles' and 'urls' keys, each
            mapping to a list of strings or to None.

    Returns:
        Always True.
    """
    authors = blacklist['authors'] or ()
    titles = blacklist['titles'] or ()
    urls = blacklist['urls'] or ()

    def first_text(element):
        """Return the stripped leading text of ``element``, or None."""
        textnode = element.firstChild
        if textnode is None or textnode.nodeType != Node.TEXT_NODE:
            return None
        return textnode.nodeValue.strip()

    def is_blacklisted(item):
        """Tell whether ``item`` matches any of the blacklists."""
        if authors or titles:
            for title in item.getElementsByTagName('title'):
                text = first_text(title)
                if text is None:
                    continue  # skip empty titles
                # Authors are matched as a prefix, titles anywhere
                if any(text.startswith(author) for author in authors):
                    return True
                if any(banned in text for banned in titles):
                    return True
        if urls:
            for link in item.getElementsByTagName('link'):
                text = first_text(link)
                if text is None:
                    continue  # skip empty links instead of crashing
                if any(text.startswith(url) for url in urls):
                    return True
        return False

    rdf = xmldocument.getElementsByTagNameNS(RDFNS, 'RDF').item(0)
    for item in rdf.getElementsByTagName('item'):
        if is_blacklisted(item):
            delete_rss1_item(item)

    return True


def filter_feed(xmldocument, blacklist):
    """Filter the document with the filter matching its feed format.

    The formats are probed in this order: RSS 2.0, RSS 1.0, Atom.

    Returns:
        The result of the format-specific filter, or False (after printing
        an error on standard error) when no format is recognised.
    """
    dispatch = (
        (is_rss2, filter_rss2),
        (is_rss1, filter_rss1),
        (is_atom, filter_atom),
    )
    for recognises, apply_filter in dispatch:
        if recognises(xmldocument):
            return apply_filter(xmldocument, blacklist)

    print('Unsupported feed type', file=sys.stderr)
    return False


def read_config_url(config, configfile):
    """Return the feed URL found in the config, with its path URL-escaped.

    Args:
        config: parsed configuration (a configparser parser object).
        configfile: name of the config file, only used in error messages.

    Returns:
        The URL as a string, or None (after printing an error on standard
        error) when the [feed] section or its ``url`` option is missing or
        empty.
    """
    try:
        url = config.get('feed', 'url')
    except cp.NoSectionError:
        print("Error: '%s' doesn't contain a [feed] section" % configfile,
              file=sys.stderr)
        return None
    except cp.NoOptionError:
        url = None  # reported below, like an empty URL
    if not url:
        print("Error: '%s' doesn't contain a feed URL" % configfile,
              file=sys.stderr)
        return None

    # URL-escape the path (bug 1485854).  '%' is kept as is so that a path
    # which is already percent-encoded doesn't get encoded a second time
    # ('%20' would otherwise turn into '%2520').
    parts = urlsplit(url)
    parts = parts._replace(path=quote(parts.path, safe='/%'))
    return urlunsplit(parts)


def read_config_blacklist(config, configfile):
    """Build the blacklist from the [blacklist] section of the config.

    Each of the 'authors', 'titles' and 'urls' options is split on
    newlines, and empty entries are dropped.  An option which is not
    present is left as None.  A missing [blacklist] section only triggers a
    single warning on standard error.

    Args:
        config: parsed configuration (a configparser parser object).
        configfile: name of the config file, only used in the warning.

    Returns:
        A dict with 'authors', 'titles' and 'urls' keys, each mapping to a
        list of strings or to None.
    """
    fields = ('authors', 'titles', 'urls')
    blacklist = dict.fromkeys(fields)

    section_warning_shown = False
    for field in fields:
        try:
            entries = config.get('blacklist', field).split("\n")
        except cp.NoSectionError:
            if not section_warning_shown:
                print("Warning: '%s' doesn't contain a [blacklist] section" %
                      configfile, file=sys.stderr)
                section_warning_shown = True
            continue
        except cp.NoOptionError:
            continue  # let's not warn about a missing blacklist

        # Remove empty elements from the blacklist
        blacklist[field] = [entry for entry in entries if entry]

    return blacklist


def download_feed(url):
    """Download the feed at ``url`` and return its (decompressed) contents.

    The request advertises gzip support, and the body is decompressed when
    the response carries a ``Content-Encoding: gzip`` header.  Every
    problem is reported on standard error.

    Args:
        url: address of the feed to fetch.

    Returns:
        The body as bytes, or None when the feed cannot be fetched, fully
        read or decompressed, or when it is empty.
    """
    # pylint: disable=too-many-return-statements
    import zlib  # only needed to recognise corrupt gzip streams

    request = Request(url, headers={'Accept-encoding': 'gzip'})
    try:
        response = urlopen(request)
    except urllib.error.HTTPError as err:
        print("Error: '%s' cannot be fetched (HTTPError): %s" % (url, err),
              file=sys.stderr)
        return None
    except urllib.error.URLError as err:
        print("Error: '%s' cannot be fetched (URLError): %s" % (url, err),
              file=sys.stderr)
        return None
    except TimeoutError as err:
        print("Error: '%s' cannot be fetched (TimeoutError): %s" % (url, err),
              file=sys.stderr)
        return None
    except http.client.BadStatusLine as err:
        print("Error: '%s' cannot be fetched (BadStatusLine): %s" % (url, err),
              file=sys.stderr)
        return None

    # Close the response whichever way this block is left
    with response:
        gzipped = response.info().get('Content-Encoding') == 'gzip'
        try:
            contents = response.read()
        except http.client.IncompleteRead as err:
            if gzipped:
                print("Error: cannot decompress gzipped response",
                      file=sys.stderr)
                return None
            print("Warning: '%s' cannot be fully read: %s" % (url, err),
                  file=sys.stderr)
            contents = None

    if gzipped:
        try:
            contents = gzip.GzipFile(fileobj=io.BytesIO(contents)).read()
        except (OSError, EOFError, zlib.error):
            # Not gzip data after all, or a truncated/corrupt stream
            print("Error: cannot decompress gzipped response", file=sys.stderr)
            return None

    if not contents:
        print("Error: '%s' could not be downloaded" % url, file=sys.stderr)
        return None

    return contents


def remove_html_entities(contents):
    """Decode a feed and replace its HTML-only entities by plain characters.

    The five entities predefined by XML (``&amp;``, ``&lt;``, ``&gt;``,
    ``&quot;`` and ``&apos;``) are left untouched since replacing them
    would break the markup; every other character reference known to
    html.unescape() is replaced by the character it stands for.  Any
    ampersand left over afterwards is escaped as ``&amp;``.

    Args:
        contents: the raw feed as bytes, decoded as UTF-8 or, failing
            that, as ISO-8859-1.

    Returns:
        The cleaned-up feed as a string.
    """
    try:
        ret = contents.decode('utf-8')
    except UnicodeDecodeError as err:
        print("Warning: not a valid UTF-8 document (%s), trying ISO-8859-1"
              % err, file=sys.stderr)
        ret = contents.decode('iso-8859-1')

    # Entities which XML itself understands and which must survive
    protected = (
        ('&amp;', 'MAGICTOKEN-AMPERSAND-MAGICTOKEN'),
        ('&lt;', 'MAGICTOKEN-LESSTHAN-MAGICTOKEN'),
        ('&gt;', 'MAGICTOKEN-GREATERTHAN-MAGICTOKEN'),
        ('&quot;', 'MAGICTOKEN-QUOTE-MAGICTOKEN'),
        ('&apos;', 'MAGICTOKEN-APOSTROPHE-MAGICTOKEN'),
    )

    # Prevent these entities from being replaced
    for entity, token in protected:
        ret = ret.replace(entity, token)

    # Built-in Python 3.4 function
    ret = html.unescape(ret)

    # Look for any unescaped ampersands
    ret = ret.replace('&', '&amp;')

    # Restore the required entities
    for entity, token in protected:
        ret = ret.replace(token, entity)

    return ret


def parse_feed(contents, url):
    """Parse the downloaded feed into a DOM document.

    When the contents cannot be parsed as they are, a second attempt is
    made on the output of remove_html_entities().

    Args:
        contents: the raw feed, as returned by download_feed().
        url: address of the feed, only used in the messages.

    Returns:
        The DOM document, or None when both attempts fail (a warning, then
        an error, are printed on standard error).
    """
    try:
        parsed = minidom.parseString(contents)
    except xml.parsers.expat.ExpatError as err:
        print("Warning: '%s' is not a valid feed (%s)" % (url, err),
              file=sys.stderr)
        parsed = None

    if not parsed:
        # Try fixing HTML entities
        cleaned = remove_html_entities(contents)
        try:
            parsed = minidom.parseString(cleaned)
        except xml.parsers.expat.ExpatError as err:
            print("Error: '%s' is not a valid feed, even with HTML entities "
                  "removed (%s)" % (url, err), file=sys.stderr)
            parsed = None

    return parsed


def process_config(configfile, outfile, overwrite):
    """Read a config file, fetch its feed and filter it.

    Args:
        configfile: path of the UTF-8 encoded config file.
        outfile: path the filtered feed is written to; when it is empty or
            None the feed is printed on standard output instead.
        overwrite: whether an already existing ``outfile`` may be replaced.

    Returns:
        False when the output file exists and ``overwrite`` is not set,
        when the config has no feed URL, when the feed cannot be parsed or
        when the output file cannot be written for lack of permissions.
        True otherwise, including when the feed could not be downloaded.
    """
    def remove_stale_output():
        """Delete a previously written output file, if any."""
        if outfile and os.path.isfile(outfile):
            os.remove(outfile)

    if outfile and os.path.isfile(outfile) and not overwrite:
        print("Error: '%s' already exists, use --force to overwrite" % outfile,
              file=sys.stderr)
        return False

    # SafeConfigParser was a deprecated alias of ConfigParser and no longer
    # exists in Python 3.12+
    config = cp.ConfigParser()
    with codecs.open(configfile, 'r', 'utf-8') as configfh:
        config.read_file(configfh)

    url = read_config_url(config, configfile)
    if not url:
        return False  # fatal error
    blacklist = read_config_blacklist(config, configfile)

    contents = download_feed(url)
    if not contents:
        remove_stale_output()
        return True  # non-fatal error

    document = parse_feed(contents, url)
    if not document:
        remove_stale_output()
        return False

    # The feed is written out even when its type is unsupported, in which
    # case it is left unfiltered.
    filter_feed(document, blacklist)

    if outfile:
        try:
            with codecs.open(outfile, 'w', 'utf-8') as outfh:
                outfh.write(document.toxml())
        except PermissionError:
            print("Error: no enough permissions to write to '%s'" % outfile,
                  file=sys.stderr)
            return False
    else:
        print(document.toxml())
    return True


def main():
    """Parse the command line and process the requested config file.

    Returns:
        The result of process_config(), or False (after printing an error
        on standard error) when the config file does not exist.
    """
    cli = argparse.ArgumentParser(
        description='Blacklist-based filter for blog aggregators.')
    cli.add_argument('configfile', type=str,
                     help='the config file to parse')
    cli.add_argument('-o', '--output', type=str, required=False,
                     metavar='file',
                     help='the output filename (default: <STDOUT>)')
    cli.add_argument('-f', '--force', action='store_true', dest='force',
                     help='overwrite the destination file')
    cli.add_argument('-V', '--version', action='version',
                     version='planetfilter {}'.format(VERSION))
    options = cli.parse_args()

    if os.path.isfile(options.configfile):
        return process_config(options.configfile, options.output,
                              options.force)

    print("Error: '%s' not found" % options.configfile, file=sys.stderr)
    return False

# Only run the filter when executed as a script, so that importing this
# module doesn't parse the command line nor terminate the interpreter.
# sys.exit() is used because the exit() builtin is only provided by the
# site module.
if __name__ == '__main__':
    sys.exit(0 if main() else 1)
