"""
parser.http.personParser module (imdb package).

This module provides the classes (and their instances) used to parse
the IMDb pages on the akas.imdb.com server about a person.
E.g., for "Mel Gibson" the referred pages would be:
    categorized:    http://akas.imdb.com/name/nm0000154/maindetails
    combined:       http://akas.imdb.com/name/nm0000154/filmoyear
    biography:      http://akas.imdb.com/name/nm0000154/bio
    ...and so on...

Copyright 2004 Davide Alberani <davide.alberani@erlug.linux.it>

This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.

This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
"""

from imdb.Movie import Movie
from imdb.utils import analyze_name
from utils import ParserBase


def normalize_roles(str_roles):
    """Break the roles string of a movie (taken from the "combined"
    page of a person) into status notes and roles.

    The string is cut at every '[' (the ']' are simply dropped); inside
    every piece the ' .... ' separator is turned into '::', and blank
    pieces are discarded.  The leading pieces that are completely wrapped
    in parentheses are returned, without the parentheses, as status
    notes; everything from the first non-parenthesized piece onward is
    returned as the list of roles.

    Return a tuple like ([list of production status], [list of roles]).
    """
    pieces = []
    for chunk in str_roles.replace(']', '').split('['):
        # Skip empty or whitespace-only chunks.
        if not chunk.strip():
            continue
        pieces.append(chunk.replace(' .... ', '::').strip())
    # Count how many leading pieces look like '(something)'.
    # NOTE(review): a piece like '(a) (b)' is taken as a single note
    # 'a) (b' - confirm whether such input can really show up.
    n_status = 0
    for piece in pieces:
        if not (piece.startswith('(') and piece.endswith(')')):
            break
        n_status += 1
    status = [piece[1:-1].strip() for piece in pieces[:n_status]]
    return (status, pieces[n_status:])


class HTMLMaindetailsParser(ParserBase):
    """Parse the "categorized" (maindetails) page of a person.

    The page must be given as a string, as fetched from the
    akas.imdb.com server.  The outcome is a dictionary that collects
    whatever analyze_name() extracts from the page title, the 'headshot'
    image source and the 'birth date', 'birth notes', 'death date' and
    'death notes' entries, when they are found.

    Example:
        cparser = HTMLMaindetailsParser()
        result = cparser.parse(categorized_html_string)
    """
    def _init(self):
        # The dictionary handed back by get_data().
        self.__person_data = {}

    def _reset(self):
        """Bring the parser back to its pristine state."""
        self.__person_data.clear()
        # Text accumulators.
        self.__name = ''
        self.__birth = ''
        self.__death = ''
        # Flags telling which piece of the page we're reading.
        self.__in_name = 0
        self.__in_birth = 0
        self.__in_death = 0

    def __store_event(self, raw, date_key, notes_key):
        """Split a 'date::note::note...' string and store the date and
        the (concatenated) notes under the given keys."""
        parts = raw.split('::')
        date = parts[0]
        notes = ''.join(parts[1:])
        if date:
            self.__person_data[date_key] = date.strip()
        if notes:
            self.__person_data[notes_key] = notes.strip()

    def __glue(self, sofar, chunk):
        """Return sofar + chunk, with a blank in between unless sofar is
        empty or already ends with a whitespace."""
        if sofar and not sofar[-1].isspace():
            sofar += ' '
        return sofar + chunk

    def get_data(self):
        """Return the dictionary with the collected information."""
        # The first '::'-separated item is the date, the rest are notes.
        self.__store_event(self.__birth, 'birth date', 'birth notes')
        self.__store_event(self.__death, 'death date', 'death notes')
        return self.__person_data

    def start_title(self, attrs):
        self.__in_name = 1

    def end_title(self):
        self.__in_name = 0
        # The page title carries the name of the person.
        self.__person_data.update(analyze_name(self.__name.strip()))

    def do_img(self, attrs):
        alt = self.get_attr_value(attrs, 'alt')
        src = self.get_attr_value(attrs, 'src')
        if not alt or not src:
            return
        # Only the image tagged as 'headshot' is of interest.
        if alt.lower() != 'headshot':
            return
        self.__person_data['headshot'] = src

    def do_br(self, attrs):
        # A <br> tag divides the date from the notes (and a note from
        # the following one); mark the boundary with '::'.
        if self.__in_birth and self.__birth:
            self.__birth = self.__birth + '::'
            return
        if self.__in_death and self.__death:
            self.__death = self.__death + '::'

    def start_dd(self, attrs): pass

    def end_dd(self):
        # The end of a <dd> closes any birth/death section.
        self.__in_death = 0
        self.__in_birth = 0

    def start_a(self, attrs): pass

    def end_a(self): pass

    def handle_data(self, data):
        text = data.strip()
        if self.__in_name:
            # The name is collected verbatim; it's stripped later.
            self.__name += data
            return
        if self.__in_birth:
            self.__birth = self.__glue(self.__birth, text)
            return
        if self.__in_death:
            self.__death = self.__glue(self.__death, text)
            return
        # Outside of any section: look for the labels opening one.
        label = text.lower()
        if label.startswith('date of death'):
            self.__in_death = 1
        elif label.startswith('date of birth'):
            self.__in_birth = 1
    

class HTMLFilmoyearParser(ParserBase):
    """Parser for the "combined" page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section: the keys
    are the lowercase role names (plus 'tv guest') and the values
    are lists of Movie objects, each with its own currentRole.

    Example:
        combparser = HTMLFilmoyearParser()
        result = combparser.parse(combined_html_string)
    """
    def _init(self):
        # This is the dictionary that will be returned by the parse() method.
        self.__movies_data = {}

    def _reset(self):
        """Reset the parser."""
        self.__movies_data.clear()
        self.__in_list = 0
        self.__in_title = 0
        self.__title = ''
        self.__roles = ''
        self.__last_imdbID = ''
        self.__in_tv = 0

    def get_data(self):
        """Return the dictionary."""
        return self.__movies_data

    def start_ol(self, attrs):
        self.__in_list = 1

    def end_ol(self):
        self.__in_list = 0
        # The end of a list also closes the TV guest appearances section.
        self.__in_tv = 0

    def start_a(self, attrs):
        href = self.get_attr_value(attrs, 'href')
        # A movie title.
        if href and href.find('/title/tt') != -1:
            self.__in_title = 1
            imdbID = self.re_imdbID.findall(href)
            if imdbID:
                self.__last_imdbID = imdbID[-1]

    def end_a(self):
        self.__in_title = 0

    def start_li(self, attrs):
        self.__in_list = 1

    def __new_movie(self, status=''):
        """Build a Movie object for the title currently being parsed;
        if a non-empty status is given, it's stored in the movie."""
        movie = Movie(movieID=self.__last_imdbID, title=self.__title,
                        accessSystem='http')
        if status:
            movie.set_data({'status': status}, override=0)
        return movie

    def end_li(self):
        """Store the movie described by the list item just closed.

        A distinct Movie object is created for every role: currentRole
        is an attribute of the object, so sharing a single instance
        among many sections would leave all of them with the role that
        was assigned last.
        """
        if self.__title and self.__roles:
            # TODO: check if roles like "voice" roles are handled properly.
            if not self.__in_tv:
                status_list, roles = normalize_roles(self.__roles)
                # Status information, shared by every role in this movie.
                status = '::'.join(status_list)
                for role in roles:
                    # With a string like 'Actor::Himself', 'actor'
                    # (lowercase) is the name of the role (the section),
                    # while 'Himself' is the current role.
                    r_split = role.split('::')
                    r_name = r_split[0].lower()
                    del r_split[0]
                    # Sometimes the role name contains a ':', like:
                    # 'singer: " mine, mine, mine "'
                    rnl = r_name.split(':')
                    r_name = rnl[0]
                    # Leave ' " mine, mine, mine "' in the current role.
                    r_split += rnl[1:]
                    r_notes = ' '.join(r_split).strip()
                    # Strip optional parentheses.
                    if r_notes and r_notes[0] == '(' and r_notes[-1] == ')':
                        r_notes = r_notes[1:-1]
                    # One fresh object per role (see the docstring).
                    movie = self.__new_movie(status)
                    movie.currentRole = r_notes
                    self.__movies_data.setdefault(r_name, []).append(movie)
            else:
                # In the TV guest section the whole text is the role.
                movie = self.__new_movie()
                movie.currentRole = self.__roles
                self.__movies_data.setdefault('tv guest', []).append(movie)
        # Get ready for the next list item.
        self.__title = ''
        self.__roles = ''

    def do_br(self, attrs):
        self.__in_list = 0

    def handle_data(self, data):
        sldata = data.strip().lower()
        if sldata == 'notable tv guest appearances':
            self.__in_tv = 1
        elif self.__in_title:
            self.__title += data
        elif self.__in_list:
            # Text in a list item, outside the title, describes the roles;
            # the chunks are joined with a single space.
            if self.__roles and not self.__roles[-1].isspace():
                self.__roles += ' '
            self.__roles += data.strip()


class HTMLBioParser(ParserBase):
    """Parser for the "biography" page of a given person.
    The page should be provided as a string, as taken from
    the akas.imdb.com server.  The final result will be a
    dictionary, with a key for every relevant section; the key is
    the lowercase text of a <dt> tag (without the trailing colon),
    the value is the text of the following <dd> tag: a string, or
    a list of strings when it's made of many paragraphs/rows.

    Example:
        bioparser = HTMLBioParser()
        result = bioparser.parse(biography_html_string)
    """
    def _init(self):
        # This is the dictionary that will be returned by the parse() method.
        self.__bio_data = {}

    def _reset(self):
        """Reset the parser."""
        self.__bio_data.clear()
        self.__sect_name = ''
        self.__sect_data = ''
        self.__in_sect = 0
        self.__in_sect_name = 0

    def get_data(self):
        """Return the dictionary."""
        return self.__bio_data

    def start_a(self, attrs): pass

    def end_a(self): pass

    def start_dt(self, attrs):
        # A <dt> tag opens a new section and contains its name.
        self.__in_sect = 1
        self.__in_sect_name = 1

    def end_dt(self):
        self.__in_sect_name = 0

    def start_dd(self, attrs): pass

    def end_dd(self):
        """Add the section just closed to the biography.

        Sections whose name turns out to be empty, once stripped, are
        ignored; the collected name and data are reset in any case.
        """
        if self.__sect_name and self.__sect_data:
            sect = self.__sect_name.strip().lower()
            # XXX: to get rid of the last colons.
            # endswith() is safe even if the name was made only of
            # whitespace, while sect[-1] would raise an IndexError.
            if sect.endswith(':'):
                sect = sect[:-1]
            if sect:
                self.__add_section(sect, self.__sect_data.strip())
        self.__sect_name = ''
        self.__sect_data = ''
        self.__in_sect = 0

    def __add_section(self, sect, data):
        """Store data under the sect key: as a string if it's a single
        item, as a list of strings if it contains '::' separators."""
        d_split = data.split('::')
        if len(d_split) == 1:
            self.__bio_data[sect] = data
            return
        # Multiple items are added separately (e.g.: 'trivia' is
        # a list of strings).
        # FIXME: some items are string, some are list of strings,
        #        some sometimes a string and sometimes a list (e.g.:
        #        spouse); this should be fixed!  Always use lists?
        items = self.__bio_data.get(sect, [])
        if not isinstance(items, list):
            # The same section was previously stored as a single
            # string: convert it, since a string can't be appended to.
            items = [items]
        self.__bio_data[sect] = items
        for d in [x.strip() for x in d_split]:
            if not d:
                continue
            items.append(d)

    def start_p(self, attrs):
        # Paragraphs inside a section are separate items.
        if self.__in_sect:
            if self.__sect_data:
                self.__sect_data += '::'

    def end_p(self): pass

    def start_tr(self, attrs):
        # Table rows inside a section are separate items, too.
        if self.__in_sect:
            if self.__sect_data:
                if self.__sect_data[-1].isspace():
                    self.__sect_data = self.__sect_data.strip()
                self.__sect_data += '::'

    def end_tr(self): pass

    def handle_data(self, data):
        if self.__in_sect_name:
            self.__sect_name += data
        elif self.__in_sect:
            # Don't pile up whitespace: if the collected text already
            # ends with a space, strip the new (non-blank) chunk.
            if not data.isspace() and self.__sect_data \
                    and self.__sect_data[-1].isspace():
                data = data.strip()
            self.__sect_data += data.replace('\n', ' ')


class HTMLOtherWorksParser(ParserBase):
    """Parse the "other works" page of a person.

    The page must be given as a string, as fetched from the
    akas.imdb.com server.  The outcome is a dictionary with the
    single 'other works' key (a list of strings), or an empty
    dictionary if nothing was found.

    Example:
        owparser = HTMLOtherWorksParser()
        result = owparser.parse(otherworks_html_string)
    """
    def _reset(self):
        """Bring the parser back to its pristine state."""
        # The list of works collected so far.
        self.__ow = []
        # The text of the work being read.
        self.__cow = ''
        self.__in_ow = 0

    def get_data(self):
        """Return the dictionary with the collected works."""
        if self.__ow:
            return {'other works': self.__ow}
        return {}

    def start_dd(self, attrs):
        self.__in_ow = 1

    def end_dd(self): pass

    def do_br(self, attrs):
        # A <br> tag closes the current work, if there's one.
        if not (self.__in_ow and self.__cow):
            return
        finished = self.__cow.strip()
        self.__ow.append(finished)
        self.__cow = ''

    def start_dl(self, attrs): pass

    def end_dl(self):
        # Flush the last work, that may not be followed by a <br>.
        self.do_br([])
        self.__in_ow = 0

    def handle_data(self, data):
        if not self.__in_ow:
            return
        self.__cow += data


# The used instances.
# One module-level instance is created for every kind of page about
# a person.
maindetails_parser = HTMLMaindetailsParser()
filmography_parser = HTMLFilmoyearParser()
bio_parser = HTMLBioParser()
otherworks_parser = HTMLOtherWorksParser()
# The parsers for the "official sites" and "awards" pages are borrowed from
# the movieParser module.
# NOTE(review): these imports are placed here and not at the top of the
# module - presumably on purpose; confirm before moving them.
from movieParser import HTMLOfficialsitesParser
person_officialsites_parser = HTMLOfficialsitesParser()
from movieParser import HTMLAwardsParser
person_awards_parser = HTMLAwardsParser()
# NOTE(review): 'subject' presumably tells the awards parser that it's
# handling the page of a person and not of a movie - verify against
# HTMLAwardsParser.
person_awards_parser.subject = 'name'

