tv_grab_nl.py

#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
SYNOPSIS

tv_grab_nl_py is a python script that trawls tvgids.nl for TV
programming information and outputs it in XMLTV-formatted output (see
http://membled.com/work/apps/xmltv). Users of MythTV
(http://www.mythtv.org) will appreciate the output generated by this
grabber, because it fills the category fields, i.e. colors in the EPG,
and has logos for most channels automagically available. Check the
website below for screenshots.  The newest version of this script can be
found here:

     https://github.com/macfreek/tvgrabnlpy

This is a fork of the original code at

     http://code.google.com/p/tvgrabnlpy/

USAGE

Check the web site above and/or run script with --help and start from there

HISTORY

tv_grab_nl_py used to be called tv_grab_nl_pdb, created by Paul de Bruin
and first released on 2003/07/09. At the same time the code base switched
from using CVS to SVN at Google Code, and as a result the version numbering
scheme has changed. The latest official release of tv_grab_nl_pdb is 0.48.
The first official release of tv_grab_nl_py is 6. In 2012, the codebase
moved to GitHub, and the version number was changed once more. The latest
subversion release of tv_grab_nl_py is r109. The first Git release of
tv_grab_nl_py is 2012-03-11 12:03.

QUESTIONS

Questions (and patches) are welcome at:
http://www.pwdebruin.net/mailman/listinfo/tv_grab_nl_py_pwdebruin.net
https://github.com/macfreek/tvgrabnlpy/issues

IMPORTANT NOTES

If you were using tv_grab_nl from the XMLTV bundle then enable the
compat flag or use the --compat command-line option.  Otherwise, the
xmltvid's are wrong and you will not see any new data in MythTV.

CONTRIBUTORS

Main author: Paul de Bruin (paul at pwdebruin dot net)
Current maintainer: Freek Dijkstra (software at macfreek dot nl)

Michel van der Laan made available his extensive collection of
high-quality logos that is used by this script.

Michael Heus has taken the effort to further enhance this script so that
it now also includes:
 - Credit info: directors, actors, presenters and writers
 - removal of programs that are actually just groupings/broadcasters 
   (e.g. "KETNET", "Wild Friday", "Z@pp")
 - Star-rating for programs tipped by tvgids.nl
 - Black&White, Stereo and URL info
 - Better detection of Movies
 - and much, much more... 

Several other people have provided feedback and patches (these are the
people I could find in my email archive, if you are missing from this
list let me know):
Huub Bouma, Roy van der Kuil, Remco Rotteveel, Mark Wormgoor, Dennis van
Onselen, Hugo van der Kooij, Han Holl, Ian Mcdonald, Udo van den Heuvel,
Paul Sijben, Sietse Visser.
""" 

# Python 3 compatibility
from __future__ import unicode_literals
# from __future__ import print_function

# Modules we need
import re, getopt, sys, codecs
import time, random
import os, os.path, pickle
try:
    import urllib.request as urllib
except ImportError:
    import urllib2 as urllib
try:
    from html.entities import name2codepoint
except ImportError:
    from htmlentitydefs import name2codepoint
from threading import Thread
from xml.sax import saxutils
import io
import json
try:
    unichr(42)
except NameError:
    unichr = chr    # Python 3

# Extra check for the datetime module 
try:
    import datetime
except ImportError:
    sys.stderr.write('This script needs the datetime module that was introduced in Python version 2.3.\n')
    sys.stderr.write('You are running:\n')
    sys.stderr.write('%s\n' % sys.version)
    raise

import sys,codecs,locale


# Release identifier of this script. The module docstring lists this same
# value as the first Git release.
VERSION = "2012-03-11 12:03"


# XXX: fix to prevent crashes in Snow Leopard [Robert Klep]
# Runs only on Mac OS X ("darwin") under exactly Python 2.6.1: one throw-away
# urlopen() call is made at import time and whatever it raises is ignored.
# NOTE(review): presumably the call itself is the workaround and its result
# is irrelevant -- confirm before removing.
if sys.platform == 'darwin' and sys.version_info[:3] == (2, 6, 1):
    try:
        urllib.urlopen('http://localhost.localdomain')
    except Exception:
        pass


# globals

# Pattern for a single character reference: hexadecimal (&#x41;), decimal
# (&#65;) or named (&amp;). Compiled once at import time and reused.
r_entity = re.compile(r'&(#x[0-9A-Fa-f]+|#[0-9]+|[A-Za-z]+);')

# Root of the site and the two JSON feeds (channel list and programme list)
# that are derived from it.
tvgids = 'http://www.tvgids.nl/'
channels_zoeken = tvgids + 'json/lists/channels.php'
uitgebreid_zoeken = tvgids + 'json/lists/programs.php'

# Number of seconds to wait for a URL fetch before giving up on it;
# 10 seconds seems reasonable.
global_timeout = 10

# Lower and upper bound (in seconds) of the random pause between two page
# fetches. We want to be nice and not hammer tvgids.nl (these are the
# friendly people that provide our data...). It also appears that tvgids.nl
# throttles its output, so there is no point in lowering these numbers.
# If you are in a hurry, use the (default) fast mode instead.
nice_time = [1, 2]

# Longest gap or overlap between two programs, in minutes, that is corrected.
max_overlap = 10

# Strategy used to correct overlapping programming:
# 'average' = use the average of the stop time and the start time of the next program
# 'stop'    = keep the stop time of the current program and move the start of the next one
# 'start'   = keep the start time of the next program and move the stop of the current one
# 'none'    = do not correct anything and see what happens
overlap_strategy = 'average'

# Experimental strategy for clumping overlapping programming: all programs
# that overlap more than max_overlap minutes, but less than the length of
# the shortest program, are clumped together.
# Highly experimental and disabled for now.
do_clump = False

# Create a category translation dictionary
# Look in mythtv/themes/blue/ui.xml for all category names
# The keys are the categories used by tvgids.nl (lowercase please)
#
# Every key must occur only once: a dict literal silently keeps the LAST
# value for a repeated key. 'muziek' used to be listed twice ('Art/Music'
# and, further down, 'Music'); only the effective value 'Music' is kept.
cattrans = { 'amusement'        : 'Talk',
             'animatie'         : 'Animated',
             'comedy'           : 'Comedy',
             'documentaire'     : 'Documentary',
             'educatief'        : 'Educational',
             'erotiek'          : 'Adult',
             'film'             : 'Film',
             'muziek'           : 'Music',
             'informatief'      : 'Educational',
             'jeugd'            : 'Children',
             'kunst/cultuur'    : 'Arts/Culture',
             'misdaad'          : 'Crime/Mystery',
             'natuur'           : 'Science/Nature',
             'nieuws/actualiteiten' : 'News',
             'overige'          : 'Unknown',
             'religieus'        : 'Religion',
             'serie/soap'       : 'Drama',
             'sport'            : 'Sports',
             'theater'          : 'Arts/Culture',
             'wetenschap'       : 'Science/Nature'}

# Translation table for the credits part of the xmltv output: role name as
# used by tvgids.nl (lowercase please) -> xmltv credit element name.
roletrans = {
    'regisseur': 'director',
    'regie': 'director',
    'acteurs': 'actor',
    'presentatie': 'presenter',
    'scenario': 'writer',
}

# Two sources of logos, in order of preference: the first one provides the
# nice ones but is not complete, the tvgids.nl logos fill the missing bits.
logo_provider = [
    'http://visualisation.tudelft.nl/~paul/logos/gif/64x64/',
    'http://static.tvgids.nl/gfx/zenders/',
]

# Logo lookup table, keyed by channel number. Every value is a two-item list
# [index, name].
# NOTE(review): presumably `index` selects an entry of logo_provider and
# `name` is the base file name of the logo there -- confirm against the code
# that builds the icon URLs.
logo_names = {
    1: [0, 'ned1'],
    2: [0, 'ned2'],
    3: [0, 'ned3'],
    4: [0, 'rtl4'],
    5: [0, 'een'],
    6: [0, 'canvas_color'],
    7: [0, 'bbc1'],
    8: [0, 'bbc2'],
    9: [0, 'ard'],
    10: [0, 'zdf'],
    11: [1, 'rtl'],
    12: [0, 'wdr'],
    13: [1, 'ndr'],
    14: [1, 'srsudwest'],
    15: [1, 'rtbf1'],
    16: [1, 'rtbf2'],
    17: [0, 'tv5'],
    18: [0, 'ngc'],
    19: [1, 'eurosport'],
    20: [1, 'tcm'],
    21: [1, 'cartoonnetwork'],
    24: [0, 'canal+red'],
    25: [0, 'mtv-color'],
    26: [0, 'cnn'],
    27: [0, 'rai'],
    28: [1, 'sat1'],
    29: [0, 'discover-spacey'],
    31: [0, 'rtl5'],
    32: [1, 'trt'],
    34: [0, 'veronica'],
    35: [0, 'tmf'],
    36: [0, 'sbs6'],
    37: [0, 'net5'],
    38: [1, 'arte'],
    39: [0, 'canal+blue'],
    40: [0, 'at5'],
    46: [0, 'rtl7'],
    49: [1, 'vtm'],
    50: [1, '3sat'],
    58: [1, 'pro7'],
    59: [1, 'kanaal2'],
    60: [1, 'vt4'],
    65: [0, 'animal-planet'],
    73: [1, 'mezzo'],
    86: [0, 'bbc-world'],
    87: [1, 'tve'],
    89: [1, 'nick'],
    90: [1, 'bvn'],
    91: [0, 'comedy_central'],
    92: [0, 'rtl8'],
    99: [1, 'sport1_1'],
    100: [0, 'rtvu'],
    101: [0, 'tvwest'],
    102: [0, 'tvrijnmond'],
    103: [1, 'tvnoordholland'],
    104: [1, 'bbcprime'],
    105: [1, 'spiceplatinum'],
    107: [0, 'canal+yellow'],
    108: [0, 'tvnoord'],
    109: [0, 'omropfryslan'],
    114: [0, 'omroepbrabant'],
}

# User-Agent strings to impersonate, in an attempt to be less conspicuous
# to the tvgids.nl police.
user_agents = [
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322)',
    'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.0.7) Gecko/20060909 Firefox/1.5.0.7',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.9) Gecko/20071105 Firefox/2.0.0.9',
    'Mozilla/5.0 (Macintosh; U; Intel Mac OS X; en-US; rv:1.8.1.9) Gecko/20071025 Firefox/2.0.0.9',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.1.8) Gecko/20071022 Ubuntu/7.10 (gutsy) Firefox/2.0.0.8',
]


def log(message, quiet=False):
    """
    Write a diagnostic message to stderr, unless quiet is true.

    message -- the text to write; no newline is appended
    quiet   -- when true the message is dropped

    The text is always encoded as UTF-8. This may be wrong for the terminal.
    TODO: use sys.stdout.encoding, locale.getpreferredencoding(),
          sys.getfilesystemencoding(), and/or os.environ["PYTHONIOENCODING"]
          to determine the correct encoding.
    TODO: use logging module
    """
    if quiet:
        return

    encoded = message.encode("utf-8")

    # Python 3 text streams reject bytes; their underlying binary stream (if
    # there is one) takes the encoded text as is.
    binary_stream = getattr(sys.stderr, 'buffer', None)
    if binary_stream is not None:
        binary_stream.write(encoded)
        binary_stream.flush()
        return

    try:
        # Python 2: sys.stderr accepts byte strings
        sys.stderr.write(encoded)
    except TypeError:
        # text-only stream without a binary layer (e.g. io.StringIO)
        sys.stderr.write(message)


# Work in progress, the idea is to cache program categories and
# descriptions to eliminate a lot of page fetches from tvgids.nl
# for programs that do not have interesting/changing descriptions

class ProgramCache:
    """
    A cache to hold program name and category info.
    TVgids stores the detail for each program on a separate URL with an
    (apparently unique) ID. This cache stores the fetched info with the ID.
    New fetches will use the cached info instead of doing an (expensive)
    page fetch.

    The cache is a plain dict (self.pdict) that maps a program ID to the
    program dictionary; it can be saved to and restored from a pickle file.
    """
    def __init__(self, filename=None):
        """
        Create a new ProgramCache object, optionally from file

        filename -- path of a pickle file; when it names an existing file the
                    cache is loaded from it, otherwise the cache starts empty
        """

        # where we store our info
        self.filename = filename
        self.pdict = {}

        if filename is not None and os.path.isfile(filename):
            self.load(filename)

    def load(self, filename):
        """
        Loads a pickled cache dict from file

        Any error while reading or unpickling is logged and leaves an empty
        cache behind; nothing is raised.

        NOTE: unpickling can execute arbitrary code, only use cache files
        that were written by this script.
        """
        try:
            # pickle data is binary: text mode breaks on Python 3 and on
            # platforms that translate line endings
            with open(filename, 'rb') as cache_file:
                self.pdict = pickle.load(cache_file)
        except Exception:
            log('Error loading cache file: %s (possibly corrupt)' % filename)
            self.clear()

    def dump(self, filename):
        """
        Dumps a pickled cache, and makes sure it is valid

        The data is first written to filename + '.tmp'. Only after that file
        is complete does it take the place of filename, so a failing dump
        does not destroy the previous cache file.
        """
        tmpname = filename + '.tmp'
        with open(tmpname, 'wb') as cache_file:
            pickle.dump(self.pdict, cache_file)

        # os.replace (Python 3.3+) overwrites an existing target on every
        # platform in a single step
        replace = getattr(os, 'replace', None)
        if replace is not None:
            replace(tmpname, filename)
            return

        # Older Python: os.rename refuses to overwrite on some platforms, so
        # remove the old file first
        if os.access(filename, os.F_OK):
            try:
                os.remove(filename)
            except Exception:
                log('Cannot remove %s, check permissions' % filename)
        os.rename(tmpname, filename)

    def query(self, program_id):
        """
        Returns the cached program for program_id, or None when it is not
        in the cache.
        """
        return self.pdict.get(program_id)

    def add(self, program):
        """
        Adds a program (a dict), stored under its 'ID' entry
        """
        self.pdict[program['ID']] = program

    def clear(self):
        """
        Clears the cache (i.e. empties it)
        """
        self.pdict = {}

    def clean(self):
        """
        Removes all cached programming before today.
        Also removes erroneously cached programming.

        "Before today" means a 'stop-time' earlier than today 00:00 local
        time; "erroneously cached" means a name equal to 'onbekend'
        (case-insensitive). Entries that lack one of the inspected keys are
        left alone.
        """
        now = time.localtime()
        dnow = datetime.datetime(now[0], now[1], now[2])

        # Iterate over a copy of the keys: entries are deleted inside the
        # loop, and Python 3 forbids resizing a dict while iterating it
        for key in list(self.pdict.keys()):
            try:
                entry = self.pdict[key]
                if entry['stop-time'] < dnow or entry['name'].lower() == 'onbekend':
                    del self.pdict[key]
            except LookupError:
                pass


def usage():
    """
    Print the command-line help text to stdout.

    Every line is printed with a single-argument print(...) call, which is
    valid both as a Python 2 print statement and as a Python 3 function call.
    """
    help_lines = (
        'tv_grab_nl_py: A grabber that grabs tvguide data from tvgids.nl\n',
        'and stores it in XMLTV-combatible format.\n',
        'Usage:',
        '--help, -h    = print this info',
        '--configure   = create configfile (overwrites existing file)',
        '--config-file = name of the configuration file (default = ~/.xmltv/tv_grab_py.conf',
        '--capabilities = xmltv required option',
        '--desc-length = maximum allowed length of programme descriptions in bytes.',
        '--description = prints a short description of the grabber',
        '--output      = file where to put the output',
        '--days        = # number of days to grab',
        '--preferredmethod = returns the preferred method to be called',
        '--fast        = do not grab descriptions of programming',
        '--slow        = grab descriptions of programming',
        '--quiet       = suppress all output',
        '--compat      = append tvgids.nl to the xmltv id (use this if you were using tv_grab_nl)',
        '--logos 0/1   = insert urls to channel icons (mythfilldatabase will then use these)',
        '--nocattrans  = do not translate the grabbed genres into MythTV-genres',
        '--cache       = cache descriptions and use the file to store',
        '--clean_cache = clean the cache file before fetching',
        '--clear_cache = empties the cache file before fetching data',
        '--slowdays    = grab slowdays initial days and the rest in fast mode',
        '--max_overlap = maximum length of overlap between programming to correct [minutes]',
        '--overlap_strategy = what strategy to use to correct overlaps (check top of source code)',
        '--utc         = generate all data in UTC time (use with timezone "auto" in mythtv)',
    )
    for line in help_lines:
        print(line)

def filter_line_identity(m, defs=name2codepoint):
    """
    Callback for r_entity.sub(): translate one entity to its Unicode value.

    m    -- match object; group(1) is the entity body without '&' and ';'
            ('#x41', '#65' or a name such as 'amp')
    defs -- mapping from entity name to code point

    Returns the character the entity stands for. Unknown names and numbers
    that are not valid code points are returned unchanged (the full matched
    text).
    """
    k = m.group(1)
    try:
        if k.startswith("#x"):
            # skip both '#' and 'x': int('x41', 16) is a ValueError
            return unichr(int(k[2:], 16))
        elif k.startswith("#"):
            return unichr(int(k[1:]))
        else:
            return unichr(defs[k])
    except (KeyError, ValueError, OverflowError):
        # KeyError: unknown entity name
        # ValueError/OverflowError: number outside the range unichr accepts
        return m.group(0) # use as is

def filter_line(s):
    """
    Removes unwanted stuff in strings (adapted from tv_grab_be)

    In order: '&nbsp;' becomes a plain space, the remaining HTML entities
    are converted to their Unicode characters, carriage returns become
    spaces, anything that looks like an HTML tag is removed, the '~Q' and
    '~R' artifacts become apostrophes, and finally the result is
    XML-escaped.

    Returns the cleaned string. Strings are immutable, so the result of
    every replace() has to be assigned back to s.
    """

    # This has to happen before the generic entity conversion below, which
    # would otherwise already have turned &nbsp; into U+00A0
    s = s.replace('&nbsp;', ' ')

    # convert escaped HTML entities to their Unicode equivalent
    s = r_entity.sub(filter_line_identity, s)

    # I suspect the following lines are superfluous, but they do not do
    # much harm -- Han Holl
    s = s.replace('\r', ' ')
    s = re.sub('(<.*?>)', '', s) # Udo

    s = s.replace('~Q', "'")
    s = s.replace('~R', "'")

    # Hmm, not sure if I understand this. Without it, mythfilldatabase barfs
    # on program names like "Steinbrecher &..."
    # We must create valid XML -- Han Holl
    return saxutils.escape(s)
    

def calc_timezone(t, use_utc):
    """
    Takes a time from tvgids.nl and formats it with all the required
    timezone conversions.

    t       -- time string of the form 'YYYYMMDDhhmm..', e.g. '20050429075000'
    use_utc -- when true, t is interpreted as Europe/Amsterdam wall-clock
               time and converted to UTC: 'YYYYMMDDhhmm00 UTC'.
               Otherwise t is returned unchanged, followed by ' +0100' or
               ' +0200' depending on the daylight-saving flag that the local
               time zone of this machine reports for that moment.

    Returns the formatted timestamp string.
    """

    year = int(t[0:4])
    month = int(t[4:6])
    day = int(t[6:8])
    hour = int(t[8:10])
    minute = int(t[10:12])

    if use_utc:
        # evil: set the TZ environment to amsterdam... reset it back later
        # I can't think of a less evil way to do this in the current python [Huub]
        old_tz = os.environ.get('TZ')
        os.environ['TZ'] = 'Europe/Amsterdam'
        time.tzset()
        try:
            pt = time.mktime((year, month, day, hour, minute, 0, 0, 0, -1))
            utc_stamp = time.strftime('%Y%m%d%H%M00', time.gmtime(pt))
        finally:
            # always restore the caller's TZ, also when the conversion fails;
            # an empty TZ value is a valid setting and must be put back too
            if old_tz is not None:
                os.environ['TZ'] = old_tz
            else:
                del os.environ['TZ']
            time.tzset()
        return '%s %s' % (utc_stamp, 'UTC')

    # daylight-saving flag (tm_isdst) -> UTC offset used in the Netherlands
    td = {0 : '+0100', 1 : '+0200'}
    pt = time.mktime((year, month, day, hour, minute, 0, 0, 0, -1))
    try:
        is_dst = time.localtime(pt)[-1]
    except (ValueError, OverflowError, OSError):
        sys.stderr.write('Cannot convert time to timezone')
        is_dst = 0
    # fall back to standard time when the flag is not 0 or 1
    return t + ' %s' % td.get(is_dst, td[0])

def format_timezone(td, use_utc):
    """
    Render a datetime object as an XMLTV time string.

    td is formatted to minute precision (the seconds are always '00') and
    the result is handed to calc_timezone together with use_utc.
    """
    return calc_timezone(td.strftime('%Y%m%d%H%M00'), use_utc)

def get_page_internal(url, quiet=0):
    """
    Retrieves the url and returns a string with the contents.

    url   -- address to fetch; requested with a randomly chosen User-Agent
    quiet -- passed on to log(); when true, error messages are suppressed

    Returns the decoded page, or None when the page could not be fetched
    (the error is logged, not raised). This function has no timeout of its
    own, see get_page for that.
    """
    txtheaders = {'Keep-Alive' : '300',
                  'User-Agent' : random.choice(user_agents)}
    try:
        rurl = urllib.Request(url, None, txtheaders)
        fp = urllib.urlopen(rurl)
        try:
            raw = fp.read()
        finally:
            # release the connection, also when read() fails
            fp.close()
    except Exception:
        log('Cannot open url: %s\n' % url, quiet)
        return None

    try:
        # This is what tvgids.nl currently uses as encoding.
        # ISO-8859-1 assigns a character to every byte value, so this
        # decode cannot actually fail; the fallback below only becomes
        # relevant once the encoding is detected instead of assumed.
        # TODO: the encoding should be determined from the HTTP headers and/or the HTML head.
        return raw.decode('iso-8859-1', 'strict')
    except UnicodeDecodeError:
        log('Cannot decode url: %s\n' % url, quiet)
        return raw.decode('utf-8', 'replace') # At least gets the ASCII correct

class FetchURL(Thread):
    """
    Thread that fetches a single url in the background.

    After the thread has finished, `result` holds whatever
    get_page_internal returned; until then it is None.
    """
    def __init__ (self, url, quiet=0):
        Thread.__init__(self)
        self.url, self.quiet, self.result = url, quiet, None

    def run(self):
        page = get_page_internal(self.url, self.quiet)
        self.result = page

def get_page(url, quiet=0):
    """
    Wrapper around get_page_internal that adds a timeout.

    The page is fetched in a FetchURL thread that is given global_timeout
    seconds to finish.

    url   -- address to fetch
    quiet -- passed on to log(); when true, messages are suppressed

    Returns the page with every tab and newline replaced by a space, or
    None when the fetch timed out or failed. Nothing is raised.
    """
    try:
        fu = FetchURL(url, quiet)
        fu.start()
        fu.join(global_timeout)
    except Exception:
        log('get_page failed on: %s\n' % url, quiet)
        return None

    if fu.is_alive():
        # join() returned because the timeout expired, not because the
        # fetch finished
        log('get_page timed out on (>%s s): %s\n' % (global_timeout, url), quiet)
        return None

    if fu.result is None:
        # the fetch itself failed; get_page_internal already logged why
        return None

    # Get rid of tabs and newlines. A mapping is required here:
    # translate("\n\t") treats the string as a table indexed by code point
    # and leaves tabs and newlines untouched. They are replaced by a space
    # instead of deleted so that words separated only by a line break are
    # not glued together.
    return fu.result.translate({ord('\n'): ' ', ord('\t'): ' '})

def get_channels(file, quiet=0):
    """
    Get a list of all available channels and store these
    in a file.

    file  -- path of the file to (over)write; it gets one header line
             followed by one "<channel id> <channel name>" line per channel,
             sorted by channel id and encoded as UTF-8
    quiet -- passed on to get_page; when true, messages are suppressed

    Raises whatever json.loads raises when the channel feed could not be
    fetched or is not valid JSON.
    """

    # download the json feed
    total = get_page(channels_zoeken, quiet)
    channel_list = json.loads(total)

    # the json data has the channel names in XML entities.
    # if lxml is available, convert it to UTF-8
    # so 'Frysl&acirc;n' will become 'Fryslân'
    try:
        import lxml.html
        has_lxml = True
    except ImportError:
        has_lxml = False

    # convert to a map, so we can sort it..
    channels = {}
    for channel in channel_list:
        name = channel['name']
        if has_lxml:
            name = lxml.html.fromstring(name).text
        channels[int(channel['id'])] = name

    # and create a file with the channels, sorted on channel number
    # (arbitrary but who cares). io.open does the UTF-8 encoding on both
    # Python 2 and 3, and the with-block closes the file even on errors.
    with io.open(file, 'w', encoding='utf-8') as f:
        f.write("# encoding: utf-8\n")
        for k in sorted(channels):
            f.write("%s %s\n" % (k, channels[k]))

def get_channel_all_days(channel, days, quiet=0):
    """
    Get all available days of programming for channel number

    channel -- channel number as used by tvgids.nl
    days    -- number of days to grab, starting today (day offset 0)
    quiet   -- accepted for symmetry with the other fetch functions; not
               used here

    The output is a list of programming in order where each row
    contains a dictionary with program information: 'start', 'stop',
    'name', 'url', 'ID', 'offset' (day offset) and 'genre'.

    Grabbing stops early (returning what was collected so far) when the
    site answers with an empty body.
    """

    programs = []

    # Tvgids shows programs per channel per day, so we loop over the number of days
    # we are required to grab
    for offset in range(0, days):

        channel_url = uitgebreid_zoeken + '?channels=%s&day=%s' % (channel, offset)

        # be nice to the server: pause between successive requests
        if offset > 0:
            time.sleep(random.randint(nice_time[0], nice_time[1]))

        # get the raw programming for the day
        req = urllib.Request(channel_url)
        opener = urllib.build_opener()
        response = opener.open(req)
        try:
            data = response.read()
        finally:
            response.close()
        if not data:
            return programs
        # TODO: determine encoding from HTTP headers. (how does urllib handle that?)
        # The headers currently include "Content-Type: application/json; charset=utf-8", so we use utf-8.
        strdata = data.decode('utf-8', 'ignore')
        total = json.loads(strdata)

        # The programs sit under the first (only) value of the top-level
        # object, either as a list or as a dict of which only the values matter
        v = list(total.values())[0]
        if isinstance(v, dict):
            v = list(v.values())

        # and find relevant programming info
        for r in v:
            name = r['titel']
            if name == '':
                name = 'onbekend'

            tdict = {}
            # NOTE(review): presumably the dates look like
            # 'YYYY-MM-DD HH:MM:SS'; if so, these slices keep the separating
            # space (' HH:MM'), which make_daytime's int() tolerates -- confirm
            tdict['start'] = r['datum_start'][10:-3]
            tdict['stop'] = r['datum_end'][10:-3]
            tdict['name'] = name
            tdict['url'] = 'http://www.tvgids.nl/programma/' + r['db_id'] + '/'
            tdict['ID'] = r['db_id']
            tdict['offset'] = offset
            tdict['genre'] = r['genre']
            # and append the program to the list of programs
            programs.append(tdict)
    # done
    return programs

def make_daytime(time_string, offset=0, cutoff='00:00', stoptime=False):
    """
    Turn a clock time such as '11:35' into a datetime object.

    time_string -- 'HH:MM'
    offset      -- number of days from today
    cutoff      -- 'HH:MM' at which the programming day starts; a time
                   before the cutoff belongs to the next calendar day
    stoptime    -- when True, a time equal to the cutoff also belongs to
                   the next calendar day

    Examples (with today being 2006-08-03):
    In [2]:make_daytime('11:34',0)
    Out[2]:datetime.datetime(2006, 8, 3, 11, 34)

    In [3]:make_daytime('11:34',1)
    Out[3]:datetime.datetime(2006, 8, 4, 11, 34)

    In [7]:make_daytime('11:34',0,'12:00')
    Out[7]:datetime.datetime(2006, 8, 4, 11, 34)

    In [4]:make_daytime('11:34',0,'11:34',False)
    Out[4]:datetime.datetime(2006, 8, 3, 11, 34)

    In [5]:make_daytime('11:34',0,'11:34',True)
    Out[5]:datetime.datetime(2006, 8, 4, 11, 34)

    """
    hours, minutes = map(int, time_string.split(':'))

    # compare the clock times as plain numbers: '11:35' -> 1135
    clock = int(time_string.replace(':', ''))
    boundary = int(cutoff.replace(':', ''))
    next_day = (clock < boundary) or (stoptime == True and clock == boundary)

    # build the datetime from today's date, DST is handled at a later point
    today = time.localtime()
    midnight_based = datetime.datetime(today[0], today[1], today[2], hours, minutes)
    return midnight_based + datetime.timedelta(offset + (1 if next_day else 0))

def correct_times(programs, quiet=0):
    """
    Add absolute times to a list of programs as generated by
    get_channel_all_days().

    Every program dict gets a 'start-time' and a 'stop-time' entry: a
    datetime object, or None when the corresponding 'start'/'stop' string is
    missing. A 'stop' equal to 'start' is reset to None. The list is changed
    in place; only an empty list is returned as is.
    """
    if programs == []:
        return programs

    # Times *before* the day cutoff are assumed to be on the next day. The
    # cutoff is the start of the first program, or 06:00 when that is unknown.
    first_start = programs[0]['start']
    cutoff = first_start if first_start is not None else '06:00'
    one_day = datetime.timedelta(1)

    for entry in programs:
        if entry['start'] == entry['stop']:
            entry['stop'] = None

        # convert the times
        if entry['start'] is None:
            entry['start-time'] = None
        else:
            entry['start-time'] = make_daytime(entry['start'], entry['offset'], cutoff)

        if entry['stop'] is None:
            entry['stop-time'] = None
            continue

        entry['stop-time'] = make_daytime(entry['stop'], entry['offset'], cutoff, stoptime=True)

        # Extra correction, needed because the stop time of a program may be
        # on the next day, after the day cutoff. For example:
        # 06:00 - 23:40 Long Program
        # 23:40 - 00:10 Lala
        # 00:10 - 08:00 Wawa
        # This puts the end date of Wawa on the current, instead of the next
        # day. A single cutoff in make_daytime cannot detect this, so shift
        # the stop by a day when it falls exactly one calendar day before
        # the start.
        begin = entry['start-time']
        if begin is None:
            continue
        end = entry['stop-time']
        if begin.date() - end.date() == one_day:
            entry['stop-time'] = end + one_day

def _timedelta_seconds(delta):
    # Length of a timedelta in whole seconds (negative for a negative delta)
    return 24*60*60*delta.days + delta.seconds


def parse_programs(programs, offset=0, quiet=0):
    """
    Parse a list of programs as generated by get_channel_all_days()  and
    convert begin and end times to xmltv compatible times.

    programs -- list of program dicts; they get 'start-time'/'stop-time'
                entries added (see correct_times) and may be adjusted
    offset   -- not used
    quiet    -- passed on to log(); when true, messages are suppressed

    Returns a new list with the usable programs: entries without a usable
    start/stop time, entries that stop before they begin and entries that
    completely enclose the program that follows them are left out. Small
    gaps and overlaps (up to max_overlap minutes) between successive
    programs are corrected according to overlap_strategy.
    """

    # good programs
    good_programs = []

    # calculate absolute start and stop times
    correct_times(programs, quiet)

    # next, correct for missing end time and copy over all good programming to the
    # good_programs list
    last = len(programs) - 1
    for i, program in enumerate(programs):

        # Try to correct missing end time by taking start time from next program on schedule
        if program['stop-time'] is None and i < last:
            log('Oops, "%s" has no end time. Trying to fix...\n' % program['name'], quiet)
            program['stop-time'] = programs[i+1]['start-time']

        # The common case: start and end times are present and are not
        # equal to each other (yes, this can happen)
        if program['start-time'] is not None and \
           program['stop-time'] is not None and \
           program['start-time'] != program['stop-time']:
            good_programs.append(program)

    # Han Holl: try to exclude programs that stop before they begin
    # (walk backwards so that deleting does not shift the indices still to visit)
    for i in range(len(good_programs)-1,-1,-1):
        if good_programs[i]['stop-time'] <= good_programs[i]['start-time']:
            log('Deleting invalid stop/start time: %s\n' % good_programs[i]['name'], quiet)
            del good_programs[i]

    # Try to exclude programs that only identify a group or broadcaster and have overlapping start/end times with
    # the actual programs
    for i in range(len(good_programs)-2,-1,-1):

        if good_programs[i]['start-time'] <= good_programs[i+1]['start-time'] and \
           good_programs[i]['stop-time']  >= good_programs[i+1]['stop-time']:
            log('Deleting grouping/broadcaster: %s\n' % good_programs[i]['name'], quiet)
            del good_programs[i]

    for i in range(len(good_programs)-1):

        # PdB: Fix tvgids start-before-end x minute interval overlap.  An overlap (positive or
        # negative) is halved and each half is assigned to the adjacent programmes. The maximum
        # overlap length between programming is set by the global variable 'max_overlap' and is
        # default 10 minutes. Examples:
        #
        # Positive overlap (= overlap in programming):
        #   10:55 - 12:00 Lala
        #   11:55 - 12:20 Wawa
        # is transformed in:
        #   10:55 - 11:57 Lala
        #   11:57 - 12:20 Wawa
        #
        # Negative overlap (= gap in programming):
        #   10:55 - 11:50 Lala
        #   12:00 - 12:20 Wawa
        # is transformed in:
        #   10:55 - 11:55 Lala
        #   11:55 - 12:20 Wawa

        stop  = good_programs[i]['stop-time']
        start = good_programs[i+1]['start-time']
        dt    = stop-start
        avg   = start + dt // 2
        overlap = _timedelta_seconds(dt)

        # check for the size of the overlap
        if 0 < abs(overlap) <= max_overlap*60:
            if overlap > 0:
                log('"%s" and "%s" overlap %s minutes. Adjusting times.\n' % \
                    (good_programs[i]['name'],good_programs[i+1]['name'],overlap // 60), quiet)
            else:
                log('"%s" and "%s" have gap of %s minutes. Adjusting times.\n' % \
                    (good_programs[i]['name'],good_programs[i+1]['name'],abs(overlap) // 60), quiet)

            # stop-time of previous program wins
            if overlap_strategy == 'stop':
                good_programs[i+1]['start-time'] = good_programs[i]['stop-time']
            # start-time of next program wins
            elif overlap_strategy == 'start':
                good_programs[i]['stop-time'] = good_programs[i+1]['start-time']
            # average the difference
            elif overlap_strategy == 'average':
                good_programs[i]['stop-time']    = avg
                good_programs[i+1]['start-time'] = avg
            # any other strategy: leave as is

    # Experimental strategy to make sure programming does not disappear. All programs that overlap more
    # than the maximum overlap length, but less than the shortest length of the two programs are
    # clumped.
    if do_clump:
        for i in range(len(good_programs)-1):
            current   = good_programs[i]
            following = good_programs[i+1]

            overlap = _timedelta_seconds(current['stop-time'] - following['start-time'])

            # lengths of both programs in seconds (each computed from its
            # own timedelta)
            l0 = _timedelta_seconds(current['stop-time'] - current['start-time'])
            l1 = _timedelta_seconds(following['stop-time'] - following['start-time'])

            # everything is in seconds here: max_overlap is in minutes, the
            # overlap and both lengths already are seconds
            if max_overlap*60 <= abs(overlap) <= min(l0, l1) and \
                'clumpidx' not in current   and \
                'clumpidx' not in following:
                current['clumpidx']   = '0/2'
                following['clumpidx'] = '1/2'
                current['stop-time'] = following['stop-time']
                following['start-time'] = current['start-time']

    # done, nothing to see here, please move on
    return good_programs

def get_descriptions(programs, program_cache=None, nocattrans=0, quiet=0, slowdays=0):
    """
    Given a list of programs, from get_channel, retrieve program information.

    Every program whose 'offset' is smaller than slowdays is enriched in
    place: either with the entry found in program_cache, or with the data
    scraped from the detail page at program['url'] ('detail1'..'detailN',
    'genre', 'credits', 'video' and any other detail item on the page).

    Arguments:
      programs      -- list of program dicts; modified in place
      program_cache -- object offering query(ID) and add(program), or None
                       to fetch every detail page without caching
      nocattrans    -- if true, store the tvgids.nl genre (title-cased)
                       instead of translating it through cattrans
      quiet         -- passed on to log()
      slowdays      -- programs with 'offset' >= slowdays are skipped

    Returns None. Programs whose detail page cannot be fetched or parsed
    are left untouched.
    """

    # This regexp tries to find details such as Genre, Acteurs, Jaar van Premiere etc.
    detail      = re.compile(r'<li>.*?<strong>(.*?):</strong>(.*?)</li>', re.DOTALL)

    # These regexps find the main description area and lines of descriptive text in this area
    description = re.compile(r'<div id="prog-content">(.*?)</div>', re.DOTALL)
    descrline   = re.compile(r'<p>(.*?)</p>', re.DOTALL)

    # These regexps try to find the subgenre of the program, e.g. Fantasy-familiefilm, Comedyserie,
    # Woonprogramma, Culinair Programma etc.
    # descrtype searches for the subgenre in the description area, e.g.:
    #      <strong>Woonprogramma</strong><p>Nance, Tooske, Ellemieke, Marlayne en Viktor helpen mensen...
    #
    # addprogtype searches for the subgenre in the special "mijn TV agenda" link, e.g.:
    #      <a href="#perstvgids" title="Plaats dit programma in mijn TV Agenda"
    #                  onclick="addProg('10281755','Informatief','Woonprogramma.',event);return false;">
    #
    # Raw strings: '\(' is not a valid string escape, so the pattern must not
    # rely on Python passing the backslash through unchanged.
    descrtype   = re.compile(r'<strong>([^<]*)</strong>', re.DOTALL)
    addprogtype = re.compile(r"addProg\(.*?,.*?,'(.*?)',.*?\)", re.DOTALL)

    # randomize detail requests
    nprograms = len(programs)
    fetch_order = list(range(nprograms))
    random.shuffle(fetch_order)

    for counter, i in enumerate(fetch_order, 1):
        program = programs[i]
        if program['offset'] >= slowdays:
            continue

        log('\n(%3.0f%%) %s: %s ' % (100*float(counter)/float(nprograms), i, program['name']), quiet)

        # check the cache for this program's ID (the cache is optional)
        cached_program = None
        if program_cache is not None:
            cached_program = program_cache.query(program['ID'])

        if cached_program is not None:
            log(' [cached]', quiet)
            # copy the cached information, except the start/end times, rating and clumping,
            # these may have changed.
            tstart = program['start-time']
            tstop  = program['stop-time']
            clump  = program.get('clumpidx', False)

            cached_program['start-time']  = tstart
            cached_program['stop-time']   = tstop
            cached_program['star-rating'] = ''
            if clump:
                cached_program['clumpidx'] = clump
            else:
                # do not let a clumpidx stored with the cached entry leak into
                # a listing that is no longer clumped
                cached_program.pop('clumpidx', None)
            programs[i] = cached_program
            continue

        # be nice to tvgids.nl
        time.sleep(random.randint(nice_time[0], nice_time[1]))

        # get the details page, and get all the detail nodes
        try:
            log(' [normal fetch]', quiet)
            total = get_page(program['url'])
            details = detail.finditer(total)

            # raises AttributeError when the page has no description area;
            # that is handled like any other failed fetch
            descr_html = description.search(total).group(1)
            descriptions = descrline.finditer(descr_html)
        except Exception:
            # if we cannot find the description page,
            # go to next in the loop (deliberately best-effort)
            log(' [fetch failed or timed out]', quiet)
            continue

        # define containers
        program['credits'] = {}
        program['video']   = {}

        # now parse the details
        line_nr = 1

        # First, we try to find the program type in the special "mijn TV Agenda" link, if not found there we
        # search for a type in the description section.
        # Note that this type is not the same as the generic genres (these are searched later on),
        # but a more descriptive one like "Culinair programma"
        program['detail1'] = ''
        type_match = addprogtype.search(total)
        if type_match is None:
            type_match = descrtype.search(descr_html)
        if type_match is not None:
            program['detail1'] = type_match.group(1).capitalize()

        # If a type was found, we store this as first part of the regular detailed description and remove unwanted chars
        if program['detail1'] != '':
            program['detail1'] = filter_line(program['detail1'])
            line_nr = line_nr + 1

        # Secondly, we add one or more lines of the program description that are present.
        # (When no type was found, line_nr is still 1 and the first line becomes 'detail1'.)
        for descript in descriptions:
            # descript is a re.Match object
            text = descript.group(1)

            # Remove sponsored link from description if present.
            sponsor_pos = text.rfind('<i>Gesponsorde link:</i>')
            if sponsor_pos > 0:
                text = text[0:sponsor_pos]

            program['detail' + str(line_nr)] = filter_line(text).strip()
            line_nr = line_nr + 1

        # Finally, we check out all program details. These are generically denoted as:
        #
        #   <li><strong>(TYPE):</strong>(CONTENT)</li>
        #
        # Some examples:
        #
        #   <li><strong>Datum:</strong>16 oktober 2008</li>
        #   <li><strong>Genre:</strong>Amusement</li>

        for d in details:
            ctype = d.group(1).strip().lower()
            content_asis = d.group(2).strip()
            content = filter_line(content_asis).strip()

            if content == '':
                continue

            if ctype == 'genre':
                # Fix detection of movies based on description as tvgids.nl sometimes
                # categorises a movie as e.g. "Komedie", "Misdaadkomedie", "Detectivefilm".
                genre = content
                subtype = program['detail1'].lower()
                if ('film' in subtype or 'komedie' in subtype) \
                   and 'tekenfilm' not in subtype \
                   and 'animatiekomedie' not in subtype \
                   and 'filmpje' not in subtype:
                    genre = 'film'

                if nocattrans:
                    program['genre'] = genre.title()
                else:
                    try:
                        program['genre'] = cattrans[genre.lower()]
                    except LookupError:
                        # unknown category: leave the genre empty
                        program['genre'] = ''

            # Parse persons and their roles for credit info
            elif ctype in roletrans:
                names = []
                program['credits'][roletrans[ctype]] = names

                for name in content_asis.split(','):
                    # keep the part after a 'label:' prefix, before a ' - role'
                    # suffix and before a trailing 'e.a' (and others)
                    if name.find(':') != -1:
                        name = name.split(':')[1]
                    if name.find('-') != -1:
                        name = name.split('-')[0]
                    if name.find('e.a') != -1:
                        name = name.split('e.a')[0]
                    names.append(filter_line(name.strip()))

            elif ctype == 'bijzonderheden':
                if content.find('Breedbeeld') != -1:
                    program['video']['breedbeeld'] = 1
                if content.find('Zwart') != -1:
                    program['video']['blackwhite'] = 1
                if content.find('Teletekst') != -1:
                    program['teletekst'] = 1
                if content.find('Stereo') != -1:
                    program['stereo'] = 1
            elif ctype == 'url':
                program['infourl'] = content
            else:
                # In unmatched cases, we still add the parsed type and content to the program details.
                # Some of these will lead to xmltv output during the xmlefy_programs step
                program[ctype] = content

        # do not cache programming that is unknown at the time
        # of fetching.
        if program_cache is not None and program['name'].lower() != 'onbekend':
            program_cache.add(program)

    log('\ndone...\n\n', quiet)
                    
    # done
      
def title_split(program):
    """
    Some channels have the annoying habit of adding the subtitle to the title of a program.
    This function attempts to fix this, by splitting the name at the last ': '.

    The program dict is modified in place: the part before the last ': '
    stays in 'name', the part after it is stored as 'titel aflevering'.
    Nothing is split when the program already has a non-empty
    'titel aflevering' or when its genre is 'movies' or 'film' (compared
    case-insensitively). A missing or None genre is set to 'overige'.

    Returns None.
    """

    # Some programs (BBC3 when this happened) have no genre. If none, then set to a default.
    # .get() also covers a program dict without a 'genre' key at all, which
    # would otherwise raise a KeyError here.
    if program.get('genre') is None:
        program['genre'] = 'overige'

    # an episode title is already known: leave the name alone
    if program.get('titel aflevering', '') != '':
        return

    # movie titles regularly contain a colon themselves
    if program['genre'].lower() in ('movies', 'film'):
        return

    colonpos = program['name'].rfind(': ')
    # a separator at position 0 would leave an empty title, so require > 0
    if colonpos > 0:
        program['titel aflevering'] = program['name'][colonpos+1:].strip()
        program['name'] = program['name'][:colonpos].strip()

def xmlefy_programs(programs, channel, desc_len, compat=0, nocattrans=0, use_utc=0):
    """
    Given a list of program dicts, return the XMLTV <programme> elements
    for them as a list of strings (one or more fragments per program).

    Arguments:
      programs   -- list of program dicts
      channel    -- channel id, used for the channel attribute
      desc_len   -- maximum number of characters for the description;
                    longer descriptions are cut at a space and get '...'
      compat     -- if true, '.tvgids.nl' is appended to the channel id
      nocattrans -- if true, the category is marked with lang="nl"
      use_utc    -- passed on to format_timezone()
    """
    channel_suffix = '.tvgids.nl' if compat else ''
    no_details = re.compile('[Gg]een detailgegevens be(?:kend|schikbaar)')

    output = []
    for program in programs:

        clumpidx = ''
        if 'clumpidx' in program:
            clumpidx = 'clumpidx="'+program['clumpidx']+'"'

        output.append('  <programme start="%s" stop="%s" channel="%s%s" %s> \n' % \
            (format_timezone(program['start-time'], use_utc), format_timezone(program['stop-time'], use_utc),\
             channel, channel_suffix, clumpidx))

        output.append('    <title lang="nl">%s</title>\n' % filter_line(program['name']))

        if 'titel aflevering' in program and program['titel aflevering'] != '':
            output.append('    <sub-title lang="nl">%s</sub-title>\n' % filter_line(program['titel aflevering']))

        # only the first three detail lines are used, placeholders are skipped
        desc = []
        for detail_row in ('detail1', 'detail2', 'detail3'):
            if detail_row in program and not no_details.search(program[detail_row]):
                desc.append('%s ' % program[detail_row])
        if desc:
            # join and remove newlines from descriptions
            # (str.replace returns a new string, so the result must be kept)
            desc_line = "".join(desc).strip()
            desc_line = desc_line.replace('\n', ' ')
            if len(desc_line) > desc_len:
                # cut at the last space that leaves room for the ellipsis
                # NOTE(review): without any space in that range rfind returns -1
                # and only the final character is dropped.
                spacepos = desc_line[0:desc_len-3].rfind(' ')
                desc_line = desc_line[0:spacepos] + '...'
            output.append('    <desc lang="nl">%s</desc>\n' % desc_line)

        # Process credits section if present.
        # This will generate director/actor/presenter info.
        if 'credits' in program and program['credits'] != {}:
            output.append('    <credits>\n')
            for role in program['credits']:
                for name in program['credits'][role]:
                    if name != '':
                        output.append('       <%s>%s</%s>\n' % (role, name, role))
            output.append('    </credits>\n')

        if 'jaar van premiere' in program and program['jaar van premiere'] != '':
            output.append('    <date>%s</date>\n' % program['jaar van premiere'])

        if 'genre' in program and program['genre'] != '':
            output.append('    <category')
            if nocattrans:
                # untranslated categories are Dutch
                output.append(' lang="nl"')
            output.append('>%s</category>\n' % program['genre'])

        if 'infourl' in program and program['infourl'] != '':
            output.append('    <url>%s</url>\n' % program['infourl'])

        if 'aflevering' in program and program['aflevering'] != '':
            output.append('    <episode-num system="onscreen">%s</episode-num>\n' % filter_line(program['aflevering']))

        # Process video section if present
        if 'video' in program and program['video'] != {}:
            output.append('    <video>\n')
            if 'breedbeeld' in program['video']:
                output.append('           <aspect>16:9</aspect>\n')
            if 'blackwhite' in program['video']:
                output.append('           <colour>no</colour>\n')
            output.append('    </video>\n')

        if 'stereo' in program:
            output.append('    <audio><stereo>stereo</stereo></audio>\n')

        if 'teletekst' in program:
            output.append('    <subtitles type="teletext" />\n')

        # Set star-rating if applicable
        #if program['star-rating'] != '':
        #     output.append('    <star-rating><value>%s</value></star-rating>\n' % program['star-rating'])

        output.append('  </programme>\n')
    return output


def main():
    """
    Command-line entry point of the grabber.

    Parses the command-line options, reads the channel list from the
    configuration file, fetches the programming of every configured channel
    (with details unless --fast is given) and prints the result as XMLTV.

    Returns the exit status for sys.exit(): 0 on success (also after
    --description, --version, --capabilities, --preferredmethod and
    --configure), 1 after --help or when the configuration file cannot be
    used, 2 for invalid options or an unwritable output file.
    """
    # --max_overlap and --overlap_strategy are never read inside main(); the
    # overlap-correction code earlier in this file refers to these names.
    # Without this declaration the assignments below would only create unused
    # locals and the options would be silently ignored.
    # NOTE(review): presumably both are module-level settings -- confirm.
    global max_overlap, overlap_strategy

    # Parse command line options
    try:
        opts, args = getopt.getopt(sys.argv[1:], "h", ["help", "output=", "capabilities",
                                                       "preferredmethod", "days=",
                                                       "configure", "fast", "slow",
                                                       "cache=", "clean_cache", "utc",
                                                       "slowdays=","compat",
                                                       "desc-length=","description","version",
                                                       "nocattrans","config-file=",
                                                       "max_overlap=", "overlap_strategy=",
                                                       "clear_cache", "quiet","logos="])
    except getopt.GetoptError:
        usage()
        return 2

    # DEFAULT OPTIONS - Edit if you know what you are doing

    # where the output goes
    output      = None
    output_file = None

    # the total number of days to fetch
    days        = 6

    # Fetch data in fast mode, i.e. do NOT grab all the detail information,
    # fast means fast, because as it then does not have to fetch a web page for each program
    # Default: fast=0
    fast        = 0

    # number of days to fetch in slow mode. For example: --days 5 --slowdays 2, will
    # fetch the first two days in slow mode (with all the details) and the remaining three
    # days in fast mode.
    slowdays    = 6

    # no output
    quiet       = 0

    # insert url of channel logo into the xml data, this will be picked up by mythfilldatabase
    logos       = 1

    # enable this option if you were using tv_grab_nl, it adjusts the generated
    # xmltvid's so that everything works.
    compat      = 0

    # enable this option if you do not want the tvgids categories being translated into
    # MythTV-categories (genres)
    nocattrans  = 0

    # Maximum number of characters to use for program description.
    # Different values may work better in different versions of MythTV.
    desc_len = 475

    # default configuration file locations
    hpath = ''
    if 'HOME' in os.environ:
        hpath = os.environ['HOME']
    # extra test for windows users
    elif 'HOMEPATH' in os.environ:
        hpath = os.environ['HOMEPATH']

    xmltv_dir   = hpath+'/.xmltv'

    program_cache_file = xmltv_dir+'/program_cache'
    config_file = xmltv_dir+'/tv_grab_nl_py.conf'

    # cache the detail information.
    program_cache = None
    clean_cache = 1
    clear_cache = 0

    # don't convert all the program date/times to UTC (GMT) timezone.
    # by default the current timezone is Europe/Amsterdam. This works fine
    # if you are located in the Amsterdam timezone, but not if you live abroad
    # in another timezone. If you want to use the UTC timestamp in combination
    # with mythtv, be sure to set the timezone in mythtv to 'auto'
    # (TimeOffset in Settings table)
    use_utc = False

    # seed the random generator
    random.seed(time.time())

    # First pass: options that answer immediately or influence logging
    for o, a in opts:
        if o in ("-h", "--help"):
            usage()
            return 1

        if o == "--quiet":
            quiet = 1

        if o == "--description" or o == "--version":
            print("The Netherlands (tv_grab_nl_py version %s)" % VERSION)
            return 0

        if o == "--capabilities":
            print("baseline")
            print("cache")
            print("manualconfig")
            print("preferredmethod")
            return 0

        if o == '--preferredmethod':
            print('allatonce')
            return 0

        if o == '--desc-length':
            # Use the requested length for programme descriptions.
            desc_len = int(a)
            log('Using description length: %d\n' % desc_len, quiet)

    # Second pass: the config file name must be known before --configure runs
    for o, a in opts:
        if o == "--config-file":
            # use the provided name for configuration
            config_file = a
            log('Using config file: %s\n' % config_file, quiet)

    # Third pass: everything else
    for o, a in opts:
        if o == "--configure":
            # check for the ~.xmltv dir
            if not os.path.exists(xmltv_dir):
                log('You do not have the ~/.xmltv directory,', quiet)
                log('I am going to make a shiny new one for you...', quiet)
                os.mkdir(xmltv_dir)
            log('Creating config file: %s\n' % config_file, quiet)
            get_channels(config_file)
            return 0

        if o == "--days":
            # limit days to maximum supported by tvgids.nl
            days = min(int(a),6)

        if o == "--compat":
            compat = 1

        if o == "--nocattrans":
            nocattrans = 1

        if o == "--fast":
            fast = 1

        if o == "--output":
            output_file = a
            try:
                output = open(output_file,'w')
                sys.stdout = output
            except Exception:
                log('Cannot write to outputfile: %s\n' % output_file, quiet)
                return 2

        if o == "--slowdays":
            # limit slowdays to maximum supported by tvgids.nl
            slowdays = min(int(a),6)
            # slowdays implies fast == 0
            fast = 0

        if o == "--logos":
            logos = int(a)

        if o == "--clean_cache":
            clean_cache = 1
        if o == "--clear_cache":
            clear_cache = 1
        if o == "--cache":
            program_cache_file = a
        if o == "--max_overlap":
            max_overlap = int(a)
        if o == "--overlap_strategy":
            overlap_strategy = a
        if o == "--utc":
            use_utc = True

    # get configfile if available; read it in one go so the file handle is
    # closed again whatever happens further down
    try:
        with open(config_file,'rb') as f:
            bytelines = f.readlines()
    except IOError as e:
        if e.errno == 2:
            log('Config file %s not found.\n' % config_file)
            log('Re-run me with the --configure flag.\n')
        else:
            log('Config file %s: %s.\n' % (config_file, e.strerror))
        return 1

    #check for cache
    program_cache = ProgramCache(program_cache_file)
    if clean_cache != 0:
        program_cache.clean()
    if clear_cache != 0:
        program_cache.clear()

    # Go!
    channels = {}

    # Read the channel stuff
    configencoding = 'utf-8'
    reconfigline = re.compile(r'#\s*(\w+):\s*(.+)')
    for byteline in bytelines:
        try:
            line = byteline.decode(configencoding)
        except UnicodeError:
            log('Config file %s is not encoded in %s.\n' % (config_file, configencoding))
            return 1
        # strip() also removes the '\r' of CRLF line endings
        line = line.strip()
        if not line:
            # blank line: nothing to index with line[0]
            continue
        if line[0] == '#':
            # a comment of the form '# encoding: NAME' selects the encoding
            # used to decode the lines that follow
            match = reconfigline.match(line)
            if match is not None and match.group(1) == "encoding":
                configencoding = match.group(2)
                try:
                    codecs.getencoder(configencoding)
                except LookupError:
                    log('Config file %s has invalid encoding %s.\n' % (config_file, configencoding))
                    return 1
            continue

        channel = line.split(None, 1) # split on first whitespace
        if len(channel) < 2:
            # channel id without a name
            log('Config file %s: ignoring malformed line: %s\n' % (config_file, line))
            continue
        channels[channel[0]] = channel[1]

    # channels are now in channels dict keyed on channel id

    channel_suffix = '.tvgids.nl' if compat else ''

    # print header stuff
    xmlencoding = 'UTF-8'
    xml = []
    xml.append('<?xml version="1.0" encoding="%s"?>' % xmlencoding)
    xml.append('<!DOCTYPE tv SYSTEM "xmltv.dtd">')
    xml.append('<tv generator-info-name="tv_grab_nl_py $Rev$">')

    # first do the channel info
    for key in channels:
        xml.append('  <channel id="%s%s">' % (key, channel_suffix))
        xml.append('    <display-name lang="nl">%s</display-name>' % channels[key])
        if logos:
            ikey = int(key)
            if ikey in logo_names:
                full_logo_url = logo_provider[logo_names[ikey][0]]+logo_names[ikey][1]+'.gif'
                xml.append('    <icon src="%s" />' % full_logo_url)
        xml.append('  </channel>')

    if program_cache is not None:
        program_cache.clean()

    nchannels = len(channels)
    for channel_cnt, channel_id in enumerate(channels, 1):
        log('\n\nNow fetching %s(xmltvid=%s%s) (channel %s of %s)\n' % \
                (channels[channel_id], channel_id, channel_suffix, channel_cnt, nchannels), quiet)
        info = get_channel_all_days(channel_id, days, quiet)
        programs = parse_programs(info, None, quiet)

        # fetch descriptions
        if not fast:
            get_descriptions(programs, program_cache, nocattrans, quiet, slowdays)

        # Split titles with colon in it
        # Note: this only takes place if all days retrieved are also grabbed with details (slowdays=days)
        # otherwise this function might change some titles after a few grabs and thus may result in
        # loss of programmed recordings for these programs.
        if slowdays == days:
            for program in programs:
                title_split(program)

        xml.extend(xmlefy_programs(programs, channel_id, desc_len, compat, nocattrans, use_utc))

        # save the cache after each channel fetch
        if program_cache is not None:
            program_cache.dump(program_cache_file)

        # be nice to tvgids.nl
        time.sleep(random.randint(nice_time[0], nice_time[1]))
        if program_cache is not None:
            program_cache.dump(program_cache_file)

    # print footer stuff
    xml.append("</tv>")

    # print result to stdout
    xml = "".join(xml)
    data = xml.encode('utf-8')
    binary_stdout = getattr(sys.stdout, 'buffer', None)
    if binary_stdout is not None:
        # Python 3 text stream: print() would show the bytes as "b'...'",
        # so write the encoded document to the underlying binary stream
        sys.stdout.flush()
        binary_stdout.write(data + b'\n')
        binary_stdout.flush()
    else:
        # stream without a binary layer (e.g. Python 2): as before
        print(data)

    # close the outputfile if necessary
    if output is not None:
        output.close()

    # and return success
    return 0

# Only run the grabber when executed as a script, so that the file can also
# be imported as a module; the return value of main() is the exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)