from studio.dataHolder import TextData, DataBox, PageData
import re
import os
import xml.etree.ElementTree as ET

class HocrImporter:
    """Import pages and text lines from an hOCR (XHTML) file.

    Each ``div`` of class ``ocr_page`` whose ``title`` names an existing
    image becomes a ``PageData``; each ``span`` of class ``ocr_line`` inside
    it that has a valid ``bbox`` becomes a ``DataBox``.
    """

    # One token of a ``title`` attribute: a quoted string (which may contain
    # spaces or semicolons) or a run of characters that is neither whitespace
    # nor the ``;`` property separator.
    _TITLE_TOKEN_RE = re.compile(r'"[^"]*"|\'[^\']*\'|[^\s;]+')

    def __init__(self):
        self.file_path = None
        # Namespace prefix in ElementTree's "{uri}" form; '' when the
        # document is not namespaced.
        self.xmlns = ''

    def import_file(self, file_path):
        """Parse the hOCR file at *file_path* and return its pages.

        Returns a list of ``PageData`` objects; the list is empty when the
        document has no ``body`` element or no usable ``ocr_page``.
        Exceptions raised by ``ET.parse`` (unreadable or malformed file)
        propagate to the caller.
        """
        self.file_path = file_path
        root_node = ET.parse(file_path).getroot()
        # ElementTree spells namespaced tags as "{uri}tag"; remember the
        # "{uri}" part so later lookups can be qualified with it.
        matches = re.match(r'(\{.*\})html', root_node.tag)
        self.xmlns = matches.group(1) if matches else ''
        body = root_node.find('.//%sbody' % self.xmlns)
        if body is None:
            return []
        return self._import_body(body)

    def _import_body(self, node):
        """Return a ``PageData`` for every usable ``ocr_page`` div under *node*.

        Pages without a ``title``, without an ``image`` property, or whose
        image file does not exist are skipped.
        """
        pages = []
        for div in node.findall('.//%sdiv' % self.xmlns):
            if div.attrib.get('class') != 'ocr_page':
                continue
            title = div.attrib.get('title')
            if title is None:
                continue
            values = self._string_values_to_dict(title, 'image', 1)
            image_path = values[0] if values else None
            if not image_path:
                continue
            if not os.path.isabs(image_path):
                # Relative image paths are resolved against the hOCR file.
                image_path = os.path.join(os.path.dirname(self.file_path),
                                          image_path)
            if not os.path.isfile(image_path):
                continue
            page = PageData(image_path)
            page.data_boxes = self._import_clines(div)
            pages.append(page)
        return pages

    def _import_clines(self, node):
        """Return a ``DataBox`` for every ``ocr_line`` span under *node*.

        Lines without a ``title`` or without four integer ``bbox`` values
        are skipped.
        """
        data_boxes = []
        for span in node.findall('.//%sspan' % self.xmlns):
            if span.attrib.get('class') != 'ocr_line':
                continue
            title = span.attrib.get('title')
            if title is None:
                continue
            values = self._string_values_to_dict(title, 'bbox', 4)
            if len(values) < 4:
                continue
            try:
                x, y, x1, y1 = [int(value) for value in values]
            except ValueError:
                # Malformed bbox: skip this line, keep importing the rest.
                continue
            # hOCR gives inclusive corner coordinates (x0 y0 x1 y1); the box
            # is built from origin plus width/height.
            # NOTE: ``span.text`` is only the text before the first child
            # element, so text held in nested spans is not included.
            data_box = DataBox(x, y, x1 + 1 - x, y1 + 1 - y, text=span.text)
            data_boxes.append(data_box)
        return data_boxes

    @staticmethod
    def _unquote(token):
        """Return *token* without one pair of matching surrounding quotes."""
        if len(token) >= 2 and token[0] == token[-1] and token[0] in '"\'':
            return token[1:-1]
        return token

    def _string_values_to_dict(self, text, label, n_args):
        """Return the *n_args* values following *label* in a title string.

        *text* is the content of an hOCR ``title`` attribute, for example
        ``image "page.png"; bbox 0 0 100 50``.  Properties may be separated
        by ``;`` and values may be quoted; separators and surrounding quotes
        are removed from the result.  Despite the method's name the result
        is a list of strings, empty when *label* is absent or is followed by
        fewer than *n_args* tokens.
        """
        tokens = self._TITLE_TOKEN_RE.findall(text)
        for index, token in enumerate(tokens):
            if token == label and index + n_args < len(tokens):
                values = tokens[index + 1: index + 1 + n_args]
                return [self._unquote(value) for value in values]
        return []

if __name__ == '__main__':
    # Manual smoke test: import the hOCR file named on the command line,
    # falling back to the original sample path when no argument is given.
    import sys

    default_path = '/home/jay/work/ocropus/ocropus/data/testimages/align-input.hocr'
    file_path = sys.argv[1] if len(sys.argv) > 1 else default_path
    importer = HocrImporter()
    importer.import_file(file_path)
