/*
Copyright (C) 2000-2010  Ministere de la culture et de la communication (France), AJLSM
See LICENCE file
*/
package fr.gouv.culture.sdx.document;

import fr.gouv.culture.sdx.exception.SDXException;
import fr.gouv.culture.sdx.exception.SDXExceptionCode;
import fr.gouv.culture.sdx.utils.Utilities;
import org.apache.excalibur.xml.sax.SAXParser;
import org.apache.cocoon.xml.XMLConsumer;
import org.apache.cocoon.xml.XMLUtils;
import org.apache.cocoon.xml.dom.DOMStreamer;
import org.w3c.tidy.Tidy;
import org.xml.sax.SAXException;

import java.io.*;

/**
 * An HTML document, parsable and indexable.
 *
 * <p>The HTML content is cleaned up with JTidy, converted to a DOM tree and then
 * streamed as SAX events to an {@link XMLConsumer}.</p>
 */
public class HTMLDocument extends AbstractIndexableDocument implements ParsableDocument {

    private String MIMETYPE = "text/html";

    /** Optional external JTidy configuration file; only used when non-null and readable. */
    protected File tidyConf = null;

    /**
     * Creates an HTML document given an id.
     *
     * If logging is desired the super.getLog() should be set after creation.
     *
     * @param id The document's id.
     * @throws SDXException propagated from {@code setId}.
     * @see #enableLogging
     */
    public HTMLDocument(String id) throws SDXException {
        setId(id);
    }

    /**
     * Creates an HTML document.
     *
     * The document's id must be given later.
     *
     * If logging is desired the super.getLog() should be set after creation.
     * @see #enableLogging
     */
    public HTMLDocument() {
    }

    /**
     * Starts the indexing process.
     *
     * The consumer is verified, the indexation fields are reset and the document is parsed.
     *
     * @param parser     The parser to use
     * @param consumer   The consumer for the events generated by the indexing process
     * @throws SDXException if the consumer check or the parse fails
     */
    public void startIndexing(SAXParser parser, XMLConsumer consumer) throws SDXException {
        Utilities.checkXmlConsumer(super.getLog(), consumer);
        // Reset the indexation data structures before producing new events.
        super.resetFields();
        parse(parser, consumer);
    }

    /**
     * Parses a document using the previously supplied consumer.
     *
     * @param   parser      The parser to use.
     * @throws SDXException if the parse fails
     */
    public void parse(SAXParser parser) throws SDXException {
        parse(parser, this.xmlConsumer);
    }

    /**
     * Parses a document using a specific consumer.
     *
     * Note that the supplied parser is only checked for null: the HTML is read by
     * JTidy, and the resulting DOM is streamed to the consumer. JTidy's messages are
     * collected and written to the log at warn level once parsing is over.
     *
     * @param parser     The parser to use (must not be null)
     * @param consumer   The consumer of the events generated by the parse
     * @throws SDXException if the parser is null, the consumer check fails, or the
     *                      document cannot be read or streamed
     */
    public void parse(SAXParser parser, XMLConsumer consumer) throws SDXException {
        if (parser == null) {
            String[] args = new String[1];
            // The document URL (when known) tells the user which document could not be parsed.
            if (this.getURL() != null) args[0] = this.getURL().toExternalForm();
            throw new SDXException(super.getLog(), SDXExceptionCode.ERROR_PARSER_NULL, args, null);
        }

        Utilities.checkXmlConsumer(super.getLog(), consumer);

        // JTidy messages are collected in memory and logged in the finally block.
        StringWriter stringWriter = new StringWriter();
        PrintWriter errorWriter = new PrintWriter(stringWriter);
        BufferedInputStream source = null;
        try {
            Tidy tidy = createConfiguredTidy(errorWriter);

            // Extract the document using JTidy.
            source = new BufferedInputStream(this.openStream());
            org.w3c.dom.Document doc = tidy.parseDOM(source, null);

            // JTidy doesn't warn about or strip duplicate attributes in the same tag; strip them here.
            XMLUtils.stripDuplicateAttributes(doc, null);

            // Then send the SAX events.
            DOMStreamer streamer = new DOMStreamer(consumer);
            streamer.stream(doc);
        } catch (SAXException e) {
            throw buildParseException(e);
        } catch (IOException e) {
            throw buildParseException(e);
        } finally {
            // The stream was opened here, so it is released here whatever the outcome.
            closeSourceQuietly(source);

            errorWriter.flush();
            errorWriter.close();
            String tidyMessages = stringWriter.toString();
            // Avoid polluting the log with empty warnings when JTidy had nothing to report.
            if (tidyMessages.length() > 0 && super.getLog() != null && super.getLog().isWarnEnabled()) {
                super.getLog().warn(tidyMessages);
            }
        }
    }

    /**
     * Builds a JTidy instance configured to output XHTML, honouring the optional
     * external configuration file and sending its messages to the given writer.
     * (The option set is borrowed from Cocoon's HTMLGenerator class.)
     */
    private Tidy createConfiguredTidy(PrintWriter errorWriter) throws IOException {
        Tidy tidy = new Tidy();
        tidy.setXmlOut(true);
        tidy.setXHTML(true);
        tidy.setTidyMark(false);
        tidy.setXmlPi(true);
        tidy.setXmlPIs(true);

        tidy.setNumEntities(true);
        tidy.setDocType("omit");
        tidy.setBreakBeforeBR(true);
        tidy.setFixComments(true);

        // tidy.setWord2000(true) is a stronger cleaning (Word 2000 embeds
        // <![if ...]> ... <![endif]> sequences); it is deliberately left disabled.

        // JTidy verbosity follows the logger's levels.
        if (super.getLog() != null) {
            tidy.setShowWarnings(super.getLog().isWarnEnabled());
            tidy.setQuiet(!super.getLog().isInfoEnabled());
        }

        // An explicitly specified external configuration is applied after the settings above.
        if (this.tidyConf != null && tidyConf.canRead())
            tidy.setConfigurationFromFile(this.tidyConf.getCanonicalPath());

        // Get the errors into the caller's writer.
        tidy.setErrout(errorWriter);
        return tidy;
    }

    /**
     * Builds the exception reporting a parse failure, carrying the document URL
     * (when known) and the message of the underlying cause.
     */
    private SDXException buildParseException(Exception cause) {
        String[] args = new String[2];
        if (this.getURL() != null) args[0] = this.getURL().toExternalForm();
        args[1] = cause.getMessage();
        return new SDXException(super.getLog(), SDXExceptionCode.ERROR_PARSE_DOC, args, cause);
    }

    /** Closes the document stream if it was opened; a failure to close is only logged. */
    private void closeSourceQuietly(BufferedInputStream source) {
        if (source == null) return;
        try {
            source.close();
        } catch (IOException e) {
            // Parsing is already over at this point; a close failure must not mask its outcome.
            if (super.getLog() != null && super.getLog().isWarnEnabled()) {
                super.getLog().warn("Unable to close the HTML document stream: " + e.getMessage());
            }
        }
    }

    /** Gets the docType for the document. */
    public String getDocType() {
        return Document.DOCTYPE_HTML;
    }

    /**
     * Sets the transformed document for the parent document.
     * The transformed document will have the same id and preferred
     * filename as the original.
     *
     * @param content   The byte array of data
     * @throws SDXException if the content is null or the transformed document cannot be set up
     */
    public void setTransformedDocument(byte[] content) throws SDXException {
        if (content == null) {
            String[] args = new String[1];
            args[0] = this.getId();
            throw new SDXException(super.getLog(), SDXExceptionCode.ERROR_SET_TRANSFORMED_DOC, args, null);
        }
        IndexableDocument doc = new HTMLDocument();
        this.transformedDoc = doc;
        this.transformedDoc.enableLogging(super.getLog());
        this.transformedDoc.setContent(content);
        this.setUpTransformedDocument();
    }

    /**
     * Sets the transformed document for the parent document.
     * The transformed document will have the same id and preferred
     * filename as the original.
     *
     * @param file      The transformed document file
     * @throws SDXException if the file is null or the transformed document cannot be set up
     */
    public void setTransformedDocument(File file) throws SDXException {
        if (file == null) {
            String[] args = new String[1];
            args[0] = this.getId();
            throw new SDXException(super.getLog(), SDXExceptionCode.ERROR_SET_TRANSFORMED_DOC, args, null);
        }
        IndexableDocument doc = new HTMLDocument();
        this.transformedDoc = doc;
        this.transformedDoc.enableLogging(super.getLog());
        this.transformedDoc.setContent(file);
        this.setUpTransformedDocument();
    }

    /** Returns the mimeType field (a String) for this document. */
    public String getMimeType() {
        return this.MIMETYPE;
    }

    /**
     * Sets an external JTidy configuration file, applied at parse time when it is readable.
     *
     * @param tidyConf  The JTidy configuration file, or null for none
     */
    public void setTidyConfiguration(File tidyConf) {
        this.tidyConf = tidyConf;
    }

    /** Adds additional system fields to the Lucene document; HTML documents add none. */
    public void addAdditionalSystemFields(org.apache.lucene.document.Document doc) {
        // Nothing here
    }

}
