#!/usr/local/bin/python2.7

"""
yle-dl - Frontend for rtmpdump-yle, the YLE Areena stream downloader

Copyright (C) 2010, 2011 Antti Ajanki <antti.ajanki@iki.fi>

This script extracts stream information from a YLE Areena web page and
calls rtmpdump-yle with correct parameters.
"""

import sys
import urllib
import urllib2
import re
import subprocess
import os
import signal
import urlparse
import htmlentitydefs
import json
import string

RTMPDUMPYLE_BINARY = '/usr/local/bin/rtmpdump-yle'  # @BINARY@ make install replaces this line with the real path

# RTMP endpoint and player SWF used for YLE Areena streams, and the
# rtmpdump-yle options built from them.
AREENA_RTMP = 'rtmp://flashu.yle.fi/AreenaServer'
AREENA_SWF = 'http://areena.yle.fi/player/Application.swf?build=2'
RTMPDUMPYLE_OPTIONS = ['-r', AREENA_RTMP, '-s', AREENA_SWF, '-m', '60']

# Player SWF and rtmpdump-yle options used for Elava Arkisto streams.
ARKISTO_SWF = 'http://yle.fi/elavaarkisto/flowplayer/flowplayer.commercial-3.2.7.swf?0.739011391531676'
RTMPDUMPYLE_OPTIONS_ARKISTO = ['-s', ARKISTO_SWF, '-m', '60']

# rtmpdump-yle options used for YleX streams.
RTMPDUMPYLE_OPTIONS_YLEX = ['-m', '60', '--areenaParams', '']

# Browser-like headers sent with every page request made by
# download_page().
HTTP_HEADERS = {'User-Agent': 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.1) Gecko/20090624 Firefox/3.5',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
                'Accept-Language': 'fi,en-us;q=0.5',
                'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.7',
                'Connection': 'close',
                'Referer': 'http://areena.yle.fi/'}

# list of all options that require an argument
ARGOPTS = ('--rtmp', '-r', '--host', '-n', '--port', '-c', '--socks',
           '-S', '--swfUrl', '-s', '--tcUrl', '-t', '--pageUrl', '-p',
           '--app', '-a', '--swfhash', '-w', '--swfsize', '-x', '--swfVfy',
           '-W', '--swfAge', '-X', '--auth', '-u', '--conn', '-C',
           '--flashVer', '-f', '--subscribe', '-d', '--flv', '-o',
           '--timeout', '-m', '--start', '-A', '--stop', '-B', '--token',
           '-T', '--skip', '-k', '--areenaParams', '--ylePassi', '--destdir')

# Set to True by main() when a verbose/debug option is given.
debug = False

# Characters that sane_filename() replaces with '_'. main() switches
# excludechars to the Windows set when --vfat is given. The Windows set
# includes the backslash, which is a path separator there and therefore
# not usable inside a file name.
excludechars_linux = '*/|'
excludechars_windows = '\\"*/:<>?|'
excludechars = excludechars_linux

def usage():
    """Print the usage message to stderr.

    The yle-dl specific options are printed first, followed by the
    output of ``rtmpdump-yle --help``. If the rtmpdump-yle binary can
    not be executed, a short notice is printed instead of letting the
    OSError escape as a traceback.
    """
    lines = (
        "Usage:",
        "%s [yle-dl or rtmpdump options] URL" % sys.argv[0],
        "",
        "yle-dl options:",
        "",
        "--episodes              Download all episodes from the given YLE Areena page",
        "--latestepisode         Download the latest episode",
        "--showurl               Print librtmp-compatible URL, don't download",
        "--vfat                  Create Windows-compatible filenames",
        "",
        "rtmpdump options:",
        "",
    )
    for line in lines:
        print >> sys.stderr, line

    try:
        subprocess.call([RTMPDUMPYLE_BINARY, '--help'])
    except OSError as exc:
        # Raised e.g. when the binary is missing or not executable.
        print >> sys.stderr, "Can't execute %s: %s" % (RTMPDUMPYLE_BINARY, exc)

def download_page(url):
    """Return the contents of the page at url as a unicode string.

    'http://' is prepended when url has no scheme. The body is decoded
    using the charset given in the Content-Type header, or ISO-8859-1
    when the header names no charset or names one Python does not know.
    Undecodable bytes are replaced.

    Returns None (after printing a message to stderr) if the page can
    not be opened or the URL is invalid.
    """
    if url.find('://') == -1:
        url = 'http://' + url

    request = urllib2.Request(url, headers=HTTP_HEADERS)
    try:
        urlreader = urllib2.urlopen(request)
        try:
            charset = urlreader.info().getparam('charset')
            content = urlreader.read()
        finally:
            # Release the connection even if reading fails.
            urlreader.close()
    except urllib2.URLError as exc:
        print >> sys.stderr, "Can't read %s: %s" % (url, str(exc.reason))
        return None
    except ValueError:
        print >> sys.stderr, 'Invalid URL: ' + url
        return None

    if charset is None:
        charset = 'iso-8859-1'

    try:
        return unicode(content, charset, 'replace')
    except LookupError:
        # The server announced a charset name that has no codec.
        return unicode(content, 'iso-8859-1', 'replace')

def encode_url_utf8(url):
    """Encode the path component of url to percent-encoded UTF8."""
    (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)

    path = path.encode('UTF8')

    # Assume that the path is already encoded if there seems to be
    # percent encoded entities.
    if re.search(r'%[0-9A-Fa-f]{2}', path) is None:
        path = urllib.quote(path, '/+')

    return urlparse.urlunparse((scheme, netloc, path, params, query, fragment))

def decode_html_entity(entity):
    if not entity:
        return u''

    try:
        x = htmlentitydefs.entitydefs[entity]
    except KeyError:
        x = entity

    if x.startswith('&#') and x[-1] == ';':
        x = x[1:-1]

    if x[0] == '#':
        try:
            return unichr(int(x[1:]))
        except (ValueError, OverflowError):
            return u'?'
    else:
        return unicode(x, 'iso-8859-1', 'ignore')

def replace_entitydefs(content):
    """Return content with each '&...;' span replaced by the result of
    decode_html_entity() for the text between '&' and ';'."""
    def _decode(match):
        return decode_html_entity(match.group(1))

    return re.compile(r'&(.*?);').sub(_decode, content)

def sane_filename(name):
    """Return name made usable as a file name.

    Leading and trailing spaces and dots are stripped and every
    character listed in excludechars is replaced with '_'. If nothing
    is left, 'ylevideo' is returned. Both byte strings and unicode
    strings are accepted.
    """
    if isinstance(name, unicode):
        # unicode.translate wants a mapping of code points
        table = {}
        for char in excludechars:
            table[ord(char)] = ord(u'_')
    else:
        table = string.maketrans(excludechars, '_'*len(excludechars))

    cleaned = name.strip(' .').translate(table)
    return cleaned or 'ylevideo'

def execute_rtmpdump(args):
    """Start rtmpdump-yle process with argument list args and wait
    until completion.

    Returns the exit status of the process, or 2 if the process can
    not be started or the wait is interrupted with Ctrl-C. On Ctrl-C
    the child is sent SIGINT and waited for before returning.
    """
    if debug:
        print >> sys.stderr, 'Executing:'
        print >> sys.stderr, ' '.join(args)

    # Starting and waiting are handled separately so that the interrupt
    # handler below never refers to a process that was not created.
    try:
        rtmpdump_process = subprocess.Popen(args)
    except KeyboardInterrupt:
        return 2
    except OSError as exc:
        print >> sys.stderr, "Execution failed:", exc
        return 2

    try:
        return rtmpdump_process.wait()
    except KeyboardInterrupt:
        try:
            os.kill(rtmpdump_process.pid, signal.SIGINT)
            rtmpdump_process.wait()
        except OSError:
            # The child has already exited; nothing left to clean up.
            pass
        return 2
    except OSError as exc:
        print >> sys.stderr, "Execution failed:", exc
        return 2

def downloader_factory(url):
    """Return a downloader object suitable for url.

    Elava Arkisto and YleX URLs are recognised by their prefix; every
    other URL is handled by AreenaDownloader.
    """
    arkisto_prefixes = ('http://www.yle.fi/elavaarkisto/',
                        'http://yle.fi/elavaarkisto/')

    if url.startswith(arkisto_prefixes):
        return ElavaArkistoDownloader()
    if url.startswith('http://ylex.yle.fi/'):
        return YleXDownloader()
    return AreenaDownloader()


### Areena ###


class AreenaDownloader:

    def download_single_episode(self, url, parameters):
        """Extracts Areena player params from a web page at url and
        starts a rtmpdump-yle process with additional parameters."""
        areenaparams = self.download_params(url)
        if not areenaparams:
            return 1

        args = [RTMPDUMPYLE_BINARY]
        args += RTMPDUMPYLE_OPTIONS
        args += ['--pageUrl', url]
        args += parameters
        args += ['--areenaParams', areenaparams]

        return execute_rtmpdump(args)

    def download_episodes(self, url, parameters, episodes, latest_only):
        """Extract all episodes (or just the latest episode if
        latest_only is True) from url."""
        if not episodes:
            return self.download_single_episode(url, parameters)

        episodelist = self.get_playlist(url, latest_only)
        if episodelist is None:
            return 1

        if len(episodelist) > 1:
            print >> sys.stderr, 'Downloading %d episodes' % len(episodelist)

        for episodeurl in episodelist:
            print >> sys.stderr, 'Downloading episode at ' + episodeurl
            status = self.download_single_episode(
                       episodeurl, parameters + ['--noOverwrite'])
            if status != 0:
                return status

        return 0

    def download_params(self, url):
        """Download a web page at url and return Areena parameters
        extracted from it."""
        html = download_page(url)
        if html is None:
            return None

        areenaparams = self.extract_params(html)
        if areenaparams is None:
            print >> sys.stderr, "Can't find AreenaPlayerParams from the page."
            print >> sys.stderr, "Is %s really a YLE Areena video page?" % url
            return None

        return areenaparams

    def extract_params(self, html):
        m = re.search(r'<div class="AreenaPlayerParams">([^<]*)</div>', html)
        if m is None:
            return None
        else:
            return m.group(1)

    def get_playlist(self, areenaurl, latest_only):
        episodelist = self.get_episode_pages(areenaurl)
        if episodelist is None:
            return None

        if latest_only:
            episodelist = episodelist[:1]

        return episodelist

    def get_episode_pages(self, url):
        if debug:
            print >> sys.stderr, 'Searching for episodes in %s' % url

        html = download_page(url)
        if html is None:
            return None

        m = re.search(r'<a href="(.*?)">Tilaa (?:RSS|uusimmat)</a>', html)
        if m is None:
            print >> sys.stderr, "No RSS link in %s" % url
            return []

        rssurl = urlparse.urljoin(url, m.group(1))

        if debug:
            print >> sys.stderr, 'Getting episode RSS %s' % rssurl

        rss = download_page(rssurl)

        i = rss.find('<item>')
        if i == -1:
            return []
        rss = rss[i:]

        pages = re.findall(r'<link>(.*?)</link>', rss)
        if pages is None:
            return []
        else:
            return pages

    def print_librtmp_url(self, areenaurl):
        """Extract Areena parameters from the given URL and print
        a librtmp-compatible URL to stdout."""
        areenaparams = self.download_params(areenaurl)
        if not areenaparams:
            return 1

        print '%s swfUrl=%s pageUrl=%s areenaParams=%s' % \
            (AREENA_RTMP, AREENA_SWF, areenaurl, areenaparams)

        return 0

    def print_urls(self, areenaurl, episodes, latest_only):
        """Print librtmp-compatible URL for the stream at areenaurl
        (if episodes is false) or for all episodes found at areenaurl
        (if episodes if true)."""
        if not episodes:
            return self.print_librtmp_url(areenaurl)

        episodelist = self.get_playlist(areenaurl, latest_only)
        if episodelist is None:
            return 0

        for episodeurl in episodelist:
            self.print_librtmp_url(episodeurl)

        return 0


### Elava Arkisto ###


class ElavaArkistoDownloader:

    def extract_playlist(self, mediajson):
        pagedata = json.loads(mediajson)
        if not pagedata.has_key('media'):
            return []

        clips = []
        for mediaitem in pagedata['media']:
            title = sane_filename(mediaitem.get('title', 'elavaarkisto'))

            downloadURL = mediaitem.get('downloadURL', None)

            bestrate = 0
            bestrtmpurl = ''
            for clip in mediaitem.get('urls', {}).get('domestic', []):
                rate = float(clip.get('bitrate', 0))
                url = clip.get('url', '')
                if rate > bestrate and url:
                    bestrate = rate
                    bestrtmpurl = url

            if not bestrtmpurl:
                continue

            # YLE server requires that app is the first path component
            # only. By default librtmp would take the first two
            # components (app/appInstance).
            #
            # This also means that we can't rely on librtmp's playpath
            # parser and have to duplicate the logic here.
            k = 0
            for i, x in enumerate(bestrtmpurl):
                if x == '/':
                    k += 1
                    if k == 4:
                        break

            playpath = bestrtmpurl[(i+1):]
            bestrtmpurl = bestrtmpurl[:i]

            ext = os.path.splitext(playpath)[1]
            if ext == '.mp4':
                playpath = 'mp4:' + playpath
                ext = '.flv'
            elif ext == '.mp3':
                playpath = 'mp3:' + playpath[:-4]

            clips.append({'rtmp': bestrtmpurl, 
                          'playpath': playpath,
                          'downloadURL': downloadURL,
                          'filename': title + ext})
            
        return clips

    def download_single_episode(self, rtmpurl, playpath, downloadURL,
                                filename, parameters, pageurl):
        enc = sys.getfilesystemencoding()

        if downloadURL:
            print 'Downloading from HTTP server...'
            try:
                urllib.urlretrieve(downloadURL, filename.encode(enc))
            except IOError, exc:
                print >> sys.stderr, "Download failed :", exc
                return 2
            print 'Stream saved to', filename.encode(enc)
            return 0
        else:
            args = [RTMPDUMPYLE_BINARY]
            args += RTMPDUMPYLE_OPTIONS_ARKISTO
            args += ['-r', rtmpurl.encode(enc),
                     '-y', playpath.encode(enc),
                     '-p', pageurl.encode(enc),
                     '-o', filename.encode(enc, 'replace')]
            args += parameters

            return execute_rtmpdump(args)

    def print_librtmp_url(self, rtmpurl, playpath, pageurl, downloadURL):
        """Print a librtmp-compatible Elava Arkisto URL to stdout."""
        if downloadURL:
            print downloadURL
        else:
            print '%s playpath=%s swfUrl=%s pageUrl=%s' % \
                (rtmpurl, playpath, ARKISTO_SWF, pageurl)
        return 0

    def get_playlist(self, url, latest_episode):
        (scheme, netloc, path, params, query, fragment) = urlparse.urlparse(url)

        if '.' in path:
            path = path.rsplit('.', 1)[0]
        path = path + '.json'
        jsonurl = urlparse.urlunparse((scheme, netloc, path, '', '', ''))
        
        mediajson = download_page(jsonurl)
        if mediajson is None:
            return None

        # Yle server sends UTF-8 but doesn't set charset in
        # Content-type header. This will workaround the problem.
        mediajson = mediajson.encode('iso-8859-1').decode('utf-8')

        playlist = self.extract_playlist(mediajson)
        if len(playlist) == 0:
            print >> sys.stderr, "Can't find streams at %s." % url
            return None

        if latest_episode:
            playlist = playlist[:1]

        return playlist

    def download_episodes(self, url, parameters, episodes, latest_episode):
        """Download playlist from Elava Arkisto page at url and
        download all clips using rtmpdump-yle ."""
        playlist = self.get_playlist(url, latest_episode)
        if playlist is None:
            return 1

        for clip in playlist:
            status = self.download_single_episode(clip['rtmp'],
                                                  clip['playpath'],
                                                  clip['downloadURL'],
                                                  clip['filename'],
                                                  parameters, url)
            if status != 0:
                return status

        return 0

    def print_urls(self, url, episodes, latest_episode):
        """Download playlist from Elava Arkisto page at url and print
        a librtmp-compatible URL for each clip."""
        playlist = self.get_playlist(url, latest_episode)
        if playlist is None:
            return 1

        for clip in playlist:
            self.print_librtmp_url(clip['rtmp'], clip['playpath'],
                                   url, clip['downloadURL'])

        return 0


### YleX Areena ###


class YleXDownloader:

    def get_params(self, url):
        html = download_page(url)
        if not html:
            return None

        match = re.search(r'<h1[^>]*>(.*?)</h1>', html)
        if match:
            filename = sane_filename(replace_entitydefs(match.group(1))) + '.flv'
        else:
            filename = 'ylex.flv'

        match = re.search(r'jQuery.extend\([^,]*,(.*)\)', html)
        if not match:
            return None

        ylexdata = json.loads(match.group(1).strip())

        enc = sys.getfilesystemencoding()
        try:
            clip = ylexdata['YlexAreena']['clip'][0]
            swf = 'http://ylex.yle.fi' + ylexdata['YlexFlowplayer']['src']
            return [
              '-r', clip['plugins']['bwcheck']['hosts'][0]['host'].encode(enc),
              '-y', clip['clip']['url'].encode(enc),
              '-s', swf.encode(enc),
              '-p', url.encode(enc),
              '-o', filename.encode(enc, 'replace')]
        except (KeyError, IndexError):
            return None

    def download_episodes(self, url, argv, episodes, latest_episode):
        """Download a stream from the given YleX Areena url using
        rtmpdump-yle."""
        params = self.get_params(url)
        if not params:
            return 1

        args = [RTMPDUMPYLE_BINARY]
        args += RTMPDUMPYLE_OPTIONS_YLEX
        args += params
        args += argv

        return execute_rtmpdump(args)

    def print_urls(self, url, episodes, latest_episode):
        """Print a librtmp-compatible YleX Arkisto URL to stdout."""
        params = self.get_params(url)
        if not params:
            return 1

        x = dict((params[i], params[i+1]) for i in xrange(0, len(params), 2))

        print '%s playpath=%s areenaParams= swfUrl=%s pageUrl=%s' % \
            (x['-r'], x['-y'], x['-s'], x['-p'])

        return 0


### main program ###

    
def main():
    """Parse the command line and download, or print the URL of, the
    requested stream.

    The URL is the last argument that is neither an option nor the
    value of an option listed in ARGOPTS. Options specific to yle-dl
    are acted on here; the remaining arguments are passed on to the
    downloader. The process exits with the status returned by the
    downloader, or with 1 after printing the usage message if no URL
    is given or --help is present.
    """
    global debug
    global excludechars

    episodes = False
    latest_episode = False
    url_only = False
    argv = sys.argv[1:]

    url = None
    prevarg = ''
    for arg in argv:
        if not arg.startswith('-') and prevarg not in ARGOPTS:
            url = arg
        elif arg in ('--verbose', '-V', '--debug', '-z'):
            debug = True
        elif arg == '--episodes':
            episodes = True
        elif arg == '--latestepisode':
            latest_episode = True
            episodes = True
        elif arg == '--showurl':
            url_only = True
        elif arg == '--vfat':
            excludechars = excludechars_windows
        prevarg = arg

    # These options are handled here only and are not passed on.
    argv = [arg for arg in argv
            if arg not in ('--episodes', '--latestepisode')]

    if url is None or '--help' in argv:
        usage()
        sys.exit(1)

    argv.remove(url)

    # Is sys.getfilesystemencoding() the correct encoding for
    # sys.argv? It may also return None, in which case UTF-8 is
    # assumed.
    encoding = sys.getfilesystemencoding() or 'UTF-8'
    try:
        url = unicode(url, encoding)
    except UnicodeDecodeError:
        print >> sys.stderr, 'Warning: Failed to decode URL!'
        url = unicode(url, 'ascii', 'replace')

    url = encode_url_utf8(url)
    dl = downloader_factory(url)

    if url_only:
        sys.exit(dl.print_urls(url, episodes, latest_episode))
    else:
        sys.exit(dl.download_episodes(url, argv, episodes, latest_episode))


if __name__ == '__main__':
    main()
