#!/usr/bin/perl
# BEGIN BPS TAGGED BLOCK {{{
#
# COPYRIGHT:
#
# This software is Copyright (c) 1996-2011 Best Practical Solutions, LLC
#                                          <sales@bestpractical.com>
#
# (Except where explicitly superseded by other copyright notices)
#
#
# LICENSE:
#
# This work is made available to you under the terms of Version 2 of
# the GNU General Public License. A copy of that license should have
# been provided with this software, but in any event can be snarfed
# from www.gnu.org.
#
# This work is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 or visit their web page on the internet at
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
#
#
# CONTRIBUTION SUBMISSION POLICY:
#
# (The following paragraph is not intended to limit the rights granted
# to you to modify and distribute this software under the terms of
# the GNU General Public License and is only of importance to you if
# you choose to contribute your changes and enhancements to the
# community by submitting them to Best Practical Solutions, LLC.)
#
# By intentionally submitting any modifications, corrections or
# derivatives to this work, or any other work intended for use with
# Request Tracker, to Best Practical Solutions, LLC, you confirm that
# you are the copyright holder for those contributions and you grant
# Best Practical Solutions,  LLC a nonexclusive, worldwide, irrevocable,
# royalty-free, perpetual, license to use, copy, create derivative
# works based on those contributions, and sublicense and distribute
# those contributions and any derivatives thereof.
#
# END BPS TAGGED BLOCK }}}
use strict;
use warnings;
no warnings 'once';

# fix lib paths, some may be relative
BEGIN {
    require File::Spec;
    my @libs = ("/usr/local/libdata/perl5/site_perl", "/usr/local/libdata/perl5/site_perl");
    my $bin_path;

    for my $lib (@libs) {
        unless ( File::Spec->file_name_is_absolute($lib) ) {
            unless ($bin_path) {
                if ( File::Spec->file_name_is_absolute(__FILE__) ) {
                    $bin_path = ( File::Spec->splitpath(__FILE__) )[1];
                }
                else {
                    require FindBin;
                    no warnings "once";
                    $bin_path = $FindBin::Bin;
                }
            }
            $lib = File::Spec->catfile( $bin_path, File::Spec->updir, $lib );
        }
        unshift @INC, $lib;
    }
}

BEGIN {
    use RT;
    RT::LoadConfig();
    RT::Init();
};
use RT::Interface::CLI ();

my %OPT = (
    help        => 0,
);
my @OPT_LIST = qw(help|h!);

my $db_type = RT->Config->Get('DatabaseType');
if ( $db_type eq 'Pg' ) {
    %OPT = (
        %OPT,
        limit  => 0,
    );
    push @OPT_LIST, 'limit=i';
}
elsif ( $db_type eq 'mysql' ) {
    %OPT = (
        %OPT,
        limit  => 0,
    );
    push @OPT_LIST, 'limit=i';
}
elsif ( $db_type eq 'Oracle' ) {
    %OPT = (
        %OPT,
        memory => '2M',
    );
    push @OPT_LIST, qw(memory=s);
}

use Getopt::Long qw(GetOptions);
GetOptions( \%OPT, @OPT_LIST );

if ( $OPT{'help'} ) {
    RT::Interface::CLI->ShowHelp(
        Sections => 'NAME|DESCRIPTION|'. uc($db_type),
    );
}

my $fts_config = RT->Config->Get('FullTextSearch') || {};
unless ( $fts_config->{'Enable'} ) {
    print STDERR "Full text search disabled in the RT config."
        ." Read documentation for %FullTextSearch config option.";
    exit 1;
}
unless ( $fts_config->{'Indexed'} ) {
    print STDERR "Full text search is enabled in the RT config,"
        ." however full text search works without special index,"
        ." so this tool is not required."
        ." Read documentation for %FullTextSearch config option.";
    exit 1;
}

if ( $db_type eq 'Oracle' ) {
    my $index = $fts_config->{'IndexName'} || 'rt_fts_index';
    $RT::Handle->dbh->do(
        "begin ctx_ddl.sync_index(?, ?); end;", undef,
        $index, $OPT{'memory'}
    );
    exit;
}

my @types = qw(text html);
foreach my $type ( @types ) {
    my $attaches = attachments($type);
    $attaches->Limit(
        FIELD => 'id',
        OPERATOR => '>',
        VALUE => last_indexed($type)
    );
    $attaches->OrderBy( FIELD => 'id', ORDER => 'asc' );
    $attaches->RowsPerPage( $OPT{'limit'} || 100 );

    my $found = 0;
    while ( my $a = $attaches->Next ) {
        print "bla\n";
        debug("Found attachment #". $a->id );
        next if filter( $type, $a );
        debug("Attachment #". $a->id ." hasn't been filtered" );
        my $txt = extract($type, $a) or next;
        debug("Extracted text from attachment #". $a->id );
        $found++;
        process( $type, $a, $txt );
        debug("Processed attachment #". $a->id );
    }
    finalize( $type, $attaches ) if $found;
    clean( $type );
}

sub attachments {
    my $type = shift;
    my $res = RT::Attachments->new( RT->SystemUser );

    my $txn_alias = $res->JoinTransactions;
    $res->Limit( ALIAS => $txn_alias, FIELD => 'ObjectType', VALUE => 'RT::Ticket' );
    my $ticket_alias = $res->Join(
        ALIAS1 => $txn_alias, FIELD1 => 'ObjectId',
        TABLE2 => 'Tickets', FIELD2 => 'id'
    );
    $res->Limit(
        ALIAS => $ticket_alias,
        FIELD => 'Status',
        OPERATOR => '!=',
        VALUE => 'deleted'
    );

    return goto_specific(
        suffix => $type,
        error => "Don't know how to find $type attachments",
        arguments => [$res],
    );
}

sub last_indexed {
    my ($type) = (@_);
    return goto_specific(
        suffix => $db_type,
        error => "Don't know how to find last indexed $type attachment for $db_type DB",
        arguments => \@_,
    );
}

sub filter {
    my $type = shift;
    return goto_specific(
        suffix    => $type,
        arguments => \@_,
    );
}

sub extract {
    my $type = shift;
    return goto_specific(
        suffix    => $type,
        error     => "No way to convert $type attachment into text",
        arguments => \@_,
    );
}

sub process {
    return goto_specific(
        suffix    => $db_type,
        error     => "No processer for $db_type DB",
        arguments => \@_,
    );
}

sub finalize {
    return goto_specific(
        suffix    => $db_type,
        arguments => \@_,
    );
}

sub clean {
    return goto_specific(
        prefix    => $db_type,
        arguments => \@_,
    );
}

{
sub last_indexed_mysql {
    my $type = shift;
    my $attr = $RT::System->FirstAttribute('LastIndexedAttachments');
    return 0 unless $attr;
    return 0 unless exists $attr->{ $type };
    return $attr->{ $type } || 0;
}

sub process_mysql {
    my ($type, $attachment, $text) = (@_);

    my $doc = sphinx_template();

    my $element = $doc->createElement('sphinx:document');
    $element->setAttribute( id => $attachment->id );
    $element->appendTextChild( content => $$text );

    $doc->documentElement->appendChild( $element );
}

my $doc = undef;
sub sphinx_template {
    return $doc if $doc;

    require XML::LibXML;
    $doc = XML::LibXML::Document->new('1.0', 'UTF-8');
    my $root = $doc->createElement('sphinx:docset');
    $doc->setDocumentElement( $root );

    my $schema = $doc->createElement('sphinx:schema');
    $root->appendChild( $schema );
    foreach ( qw(content) ) {
        my $field = $doc->createElement('sphinx:field');
        $field->setAttribute( name => $_ );
        $schema->appendChild( $field );
    }

    return $doc;
}

sub finalize_mysql {
    my ($type, $attachments) = @_;
    sphinx_template()->toFH(*STDOUT, 1);
}

sub clean_mysql {
    $doc = undef;
}

}

sub last_indexed_pg {
    my $type = shift;
    my $attachments = attachments( $type );
    my $alias = 'main';
    if ( $fts_config->{'Table'} && lc $fts_config->{'Table'} ne 'Attachments' ) {
        $alias = $attachments->Join(
            TYPE    => 'left',
            FIELD1 => 'id',
            TABLE2  => $fts_config->{'table'},
            FIELD2 => 'id',
        );
    }
    $attachments->Limit(
        ALIAS => $alias,
        FIELD => $fts_config->{'Column'},
        OPERATOR => 'IS NOT',
        VALUE => 'NULL',
    );
    $attachments->OrderBy( FIELD => 'id', ORDER => 'desc' );
    my $res = $attachments->First;
    return 0 unless $res;
    return $res->id;
}

sub process_pg {
    my ($type, $attachment, $text) = (@_);

    my $dbh = $RT::Handle->dbh;
    my $table = $fts_config->{'Table'};
    my $column = $fts_config->{'Column'};

    my $query;
    if ( $table ) {
        if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
            $query = "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?";
        } else {
            $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
        }
    } else {
        $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
    }

    my $status = $dbh->do( $query, undef, $$text, $attachment->id );
    unless ( $status ) {
        die "error: ". $dbh->errstr;
    }
}

sub attachments_text {
    my $res = shift;
    $res->Limit( FIELD => 'content_type', VALUE => 'text/plain' );
    return $res;
}

sub extract_text {
    my $attachment = shift;
    my $text = $attachment->Content;
    return undef unless defined $text && length($text);
    return \$text;
}

sub attachments_html {
    my $res = shift;
    $res->Limit( FIELD => 'ContentType', VALUE => 'text/html' );
    return $res;
}

sub filter_html {
    my $attachment = shift;
    if ( my $parent = $attachment->Parent ) {
# skip html parts that are alternatives
        return 1 if $parent->id
            && $parent->ContentType eq 'mulitpart/alternative';
    }
    return 0;
}

sub extract_html {
    my $attachment = shift;
    my $text = $attachment->Content;
    return undef unless defined $text && length($text);
# TODO: html -> text
    return \$text;
}

sub goto_specific {
    my %args = (@_);

    my $func = (caller(1))[3];
    $func =~ s/.*:://;
    my $call = $func ."_". lc $args{'suffix'};
    unless ( defined &$call ) {
        return undef unless $args{'error'};
        require Carp; Carp::croak( $args{'error'} );
    }
    @_ = @{ $args{'arguments'} };
    goto &$call;
}


# helper functions
sub verbose  { print _(@_), "\n" if $OPT{verbose} || $OPT{verbose}; 1 }
sub debug    { print _(@_), "\n" if $OPT{debug}; 1 }
sub error    { $RT::Logger->error(_(@_)); verbose(@_); 1 }
sub warning  { $RT::Logger->warn(_(@_)); verbose(@_); 1 }

=head1 NAME

rt-fulltext-indexer - Indexer for full text search

=head1 DESCRIPTION

This is a helper script to keep full text index in sync with data.

Start from F<sbin/rt-setup-fulltext-index>, adjust config and run
this script via cron.

=head1 ORACLE

=head2 USAGE

    rt-fulltext-indexer --help
    rt-fulltext-indexer
    rt-fulltext-indexer --memory 10M

=head2 DESCRIPTION

Basicly this script runs the following query:

    begin
    ctx_ddl.sync_index('rt_fts_index', '2M');
    end;

Ammount of memory used for the sync can be controlled with --memory option.

There is way to do all of this on Oracle side by setting up a DBMS_JOB. Read
"Managing DML Operations for a CONTEXT Index" chapter from "Text Application
Developer's Guide" for more info on the topic and how to keep index optimized,
collect garbage and other stuff.

=head1 PG

=head2 USAGE

    rt-fulltext-indexer --help
    rt-fulltext-indexer
    rt-fulltext-indexer --limit 100

=head2 DESCRIPTION

This script find attachments that should be indexed and process content if required
and stores in the index. Use --limit option to specify how many attachments to process.

=head1 MYSQL

=head2 USAGE

    rt-fulltext-indexer --help
    rt-fulltext-indexer
    rt-fulltext-indexer --limit 100

=head2 DESCRIPTION

This script find attachments that should be indexed and process content if required
and generates xmlpipe2 document stream that can processed by sphinx. Read more in
sphinx reference manual.

Use --limit option to specify how many attachments to process.

=cut

