#!/usr/bin/perl
# BEGIN BPS TAGGED BLOCK {{{
#
# COPYRIGHT:
#
# This software is Copyright (c) 1996-2014 Best Practical Solutions, LLC
#                                          <sales@bestpractical.com>
#
# (Except where explicitly superseded by other copyright notices)
#
#
# LICENSE:
#
# This work is made available to you under the terms of Version 2 of
# the GNU General Public License. A copy of that license should have
# been provided with this software, but in any event can be snarfed
# from www.gnu.org.
#
# This work is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA
# 02110-1301 or visit their web page on the internet at
# http://www.gnu.org/licenses/old-licenses/gpl-2.0.html.
#
#
# CONTRIBUTION SUBMISSION POLICY:
#
# (The following paragraph is not intended to limit the rights granted
# to you to modify and distribute this software under the terms of
# the GNU General Public License and is only of importance to you if
# you choose to contribute your changes and enhancements to the
# community by submitting them to Best Practical Solutions, LLC.)
#
# By intentionally submitting any modifications, corrections or
# derivatives to this work, or any other work intended for use with
# Request Tracker, to Best Practical Solutions, LLC, you confirm that
# you are the copyright holder for those contributions and you grant
# Best Practical Solutions,  LLC a nonexclusive, worldwide, irrevocable,
# royalty-free, perpetual, license to use, copy, create derivative
# works based on those contributions, and sublicense and distribute
# those contributions and any derivatives thereof.
#
# END BPS TAGGED BLOCK }}}
use strict;
use warnings;
no warnings 'once';

# fix lib paths, some may be relative
BEGIN {
    require File::Spec;
    my @libs = ("/usr/local/libdata/perl5/site_perl", "/usr/local/libdata/perl5/site_perl");
    my $bin_path;

    for my $lib (@libs) {
        unless ( File::Spec->file_name_is_absolute($lib) ) {
            unless ($bin_path) {
                if ( File::Spec->file_name_is_absolute(__FILE__) ) {
                    $bin_path = ( File::Spec->splitpath(__FILE__) )[1];
                }
                else {
                    require FindBin;
                    no warnings "once";
                    $bin_path = $FindBin::Bin;
                }
            }
            $lib = File::Spec->catfile( $bin_path, File::Spec->updir, $lib );
        }
        unshift @INC, $lib;
    }
}

BEGIN {
    use RT;
    RT::LoadConfig();
    RT::Init();
};
use RT::Interface::CLI ();

my %OPT = (
    help        => 0,
    debug       => 0,
    quiet       => 0,
);
my @OPT_LIST = qw(help|h! debug! quiet);

my $db_type = RT->Config->Get('DatabaseType');
if ( $db_type eq 'Pg' ) {
    %OPT = (
        %OPT,
        limit  => 0,
        all    => 0,
    );
    push @OPT_LIST, 'limit=i', 'all!';
}
elsif ( $db_type eq 'mysql' ) {
    %OPT = (
        %OPT,
        limit    => 0,
        all      => 0,
        xmlpipe2 => 0,
    );
    push @OPT_LIST, 'limit=i', 'all!', 'xmlpipe2!';
}
elsif ( $db_type eq 'Oracle' ) {
    %OPT = (
        %OPT,
        memory => '2M',
    );
    push @OPT_LIST, qw(memory=s);
}

use Getopt::Long qw(GetOptions);
GetOptions( \%OPT, @OPT_LIST );

if ( $OPT{'help'} ) {
    RT::Interface::CLI->ShowHelp(
        Sections => 'NAME|DESCRIPTION|'. uc($db_type),
    );
}

use Fcntl ':flock';
if ( !flock main::DATA, LOCK_EX | LOCK_NB ) {
    if ( $OPT{quiet} ) {
        RT::Logger->info("$0 is already running; aborting silently, as requested");
        exit;
    }
    else {
        print STDERR "$0 is already running\n";
        exit 1;
    }
}

my $fts_config = RT->Config->Get('FullTextSearch') || {};
unless ( $fts_config->{'Enable'} ) {
    print STDERR <<EOT;

Full text search is disabled in your RT configuration.  Run
/usr/local/sbin/rt-setup-fulltext-index to configure and enable it.

EOT
    exit 1;
}
unless ( $fts_config->{'Indexed'} ) {
    print STDERR <<EOT;

Full text search is enabled in your RT configuration, but not with any
full-text database indexing -- hence this tool is not required.  Read
the documentation for %FullTextSearch in your RT_Config for more details.

EOT
    exit 1;
}

if ( $db_type eq 'Oracle' ) {
    my $index = $fts_config->{'IndexName'} || 'rt_fts_index';
    $RT::Handle->dbh->do(
        "begin ctx_ddl.sync_index(?, ?); end;", undef,
        $index, $OPT{'memory'}
    );
    exit;
} elsif ( $db_type eq 'mysql' ) {
    unless ($OPT{'xmlpipe2'}) {
        print STDERR <<EOT;

Updates to the external Sphinx index are done via running the sphinx
`indexer` tool:

    indexer rt

EOT
        exit 1;
    }
}

my @types = qw(text html);
foreach my $type ( @types ) {
  REDO:
    my $attachments = attachments($type);
    $attachments->Limit(
        FIELD => 'id',
        OPERATOR => '>',
        VALUE => last_indexed($type)
    );
    $attachments->OrderBy( FIELD => 'id', ORDER => 'asc' );
    $attachments->RowsPerPage( $OPT{'limit'} || 100 );

    my $found = 0;
    while ( my $a = $attachments->Next ) {
        next if filter( $type, $a );
        debug("Found attachment #". $a->id );
        my $txt = extract($type, $a) or next;
        $found++;
        process( $type, $a, $txt );
        debug("Processed attachment #". $a->id );
    }
    finalize( $type, $attachments ) if $found;
    clean( $type );
    goto REDO if $OPT{'all'} and $attachments->Count == ($OPT{'limit'} || 100)
}

sub attachments {
    my $type = shift;
    my $res = RT::Attachments->new( RT->SystemUser );
    my $txn_alias = $res->Join(
        ALIAS1 => 'main',
        FIELD1 => 'TransactionId',
        TABLE2 => 'Transactions',
        FIELD2 => 'id',
    );
    $res->Limit(
        ALIAS => $txn_alias,
        FIELD => 'ObjectType',
        VALUE => 'RT::Ticket',
    );
    my $ticket_alias = $res->Join(
        ALIAS1 => $txn_alias,
        FIELD1 => 'ObjectId',
        TABLE2 => 'Tickets',
        FIELD2 => 'id',
    );
    $res->Limit(
        ALIAS => $ticket_alias,
        FIELD => 'Status',
        OPERATOR => '!=',
        VALUE => 'deleted'
    );

    # On newer DBIx::SearchBuilder's, indicate that making the query DISTINCT
    # is unnecessary because the joins won't produce duplicates.  This
    # drastically improves performance when fetching attachments.
    $res->{joins_are_distinct} = 1;

    return goto_specific(
        suffix => $type,
        error => "Don't know how to find $type attachments",
        arguments => [$res],
    );
}

sub last_indexed {
    my ($type) = (@_);
    return goto_specific(
        suffix => $db_type,
        error => "Don't know how to find last indexed $type attachment for $db_type DB",
        arguments => \@_,
    );
}

sub filter {
    my $type = shift;
    return goto_specific(
        suffix    => $type,
        arguments => \@_,
    );
}

sub extract {
    my $type = shift;
    return goto_specific(
        suffix    => $type,
        error     => "No way to convert $type attachment into text",
        arguments => \@_,
    );
}

sub process {
    return goto_specific(
        suffix    => $db_type,
        error     => "No processer for $db_type DB",
        arguments => \@_,
    );
}

sub finalize {
    return goto_specific(
        suffix    => $db_type,
        arguments => \@_,
    );
}

sub clean {
    return goto_specific(
        suffix    => $db_type,
        arguments => \@_,
    );
}

{
sub last_indexed_mysql {
    my $type = shift;
    my $attr = $RT::System->FirstAttribute('LastIndexedAttachments');
    return 0 unless $attr;
    return 0 unless exists $attr->{ $type };
    return $attr->{ $type } || 0;
}

sub process_mysql {
    my ($type, $attachment, $text) = (@_);

    my $doc = sphinx_template();

    my $element = $doc->createElement('sphinx:document');
    $element->setAttribute( id => $attachment->id );
    $element->appendTextChild( content => $$text );

    $doc->documentElement->appendChild( $element );
}

my $doc = undef;
sub sphinx_template {
    return $doc if $doc;

    require XML::LibXML;
    $doc = XML::LibXML::Document->new('1.0', 'UTF-8');
    my $root = $doc->createElement('sphinx:docset');
    $doc->setDocumentElement( $root );

    my $schema = $doc->createElement('sphinx:schema');
    $root->appendChild( $schema );
    foreach ( qw(content) ) {
        my $field = $doc->createElement('sphinx:field');
        $field->setAttribute( name => $_ );
        $schema->appendChild( $field );
    }

    return $doc;
}

sub finalize_mysql {
    my ($type, $attachments) = @_;
    sphinx_template()->toFH(*STDOUT, 1);
}

sub clean_mysql {
    $doc = undef;
}

}

sub last_indexed_pg {
    my $type = shift;
    my $attachments = attachments( $type );
    my $alias = 'main';
    if ( $fts_config->{'Table'} && $fts_config->{'Table'} ne 'Attachments' ) {
        $alias = $attachments->Join(
            TYPE    => 'left',
            FIELD1 => 'id',
            TABLE2  => $fts_config->{'Table'},
            FIELD2 => 'id',
        );
    }
    $attachments->Limit(
        ALIAS => $alias,
        FIELD => $fts_config->{'Column'},
        OPERATOR => 'IS NOT',
        VALUE => 'NULL',
    );
    $attachments->OrderBy( FIELD => 'id', ORDER => 'desc' );
    $attachments->RowsPerPage( 1 );
    my $res = $attachments->First;
    return 0 unless $res;
    return $res->id;
}

sub process_pg {
    my ($type, $attachment, $text) = (@_);

    my $dbh = $RT::Handle->dbh;
    my $table = $fts_config->{'Table'};
    my $column = $fts_config->{'Column'};

    my $query;
    if ( $table ) {
        if ( my ($id) = $dbh->selectrow_array("SELECT id FROM $table WHERE id = ?", undef, $attachment->id) ) {
            $query = "UPDATE $table SET $column = to_tsvector(?) WHERE id = ?";
        } else {
            $query = "INSERT INTO $table($column, id) VALUES(to_tsvector(?), ?)";
        }
    } else {
        $query = "UPDATE Attachments SET $column = to_tsvector(?) WHERE id = ?";
    }

    my $status = eval { $dbh->do( $query, undef, $$text, $attachment->id ) };
    unless ( $status ) {
        if ( $dbh->err == 7  && $dbh->state eq '54000' ) {
            warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains too many unique words. Error: ". $dbh->errstr;
        } elsif ( $dbh->err == 7 && $dbh->state eq '22021' ) {
            warn "Attachment @{[$attachment->id]} cannot be indexed. Most probably it contains invalid UTF8 bytes. Error: ". $dbh->errstr;
        } else {
            die "error: ". $dbh->errstr;
        }

        # Insert an empty tsvector, so we count this row as "indexed"
        # for purposes of knowing where to pick up
        eval { $dbh->do( $query, undef, "", $attachment->id ) }
            or die "Failed to insert empty tsvector: " . $dbh->errstr;
    }
}

sub attachments_text {
    my $res = shift;
    $res->Limit( FIELD => 'ContentType', VALUE => 'text/plain' );
    return $res;
}

sub extract_text {
    my $attachment = shift;
    my $text = $attachment->Content;
    return undef unless defined $text && length($text);
    return \$text;
}

sub attachments_html {
    my $res = shift;
    $res->Limit( FIELD => 'ContentType', VALUE => 'text/html' );
    return $res;
}

sub filter_html {
    my $attachment = shift;
    if ( my $parent = $attachment->ParentObj ) {
# skip html parts that are alternatives
        return 1 if $parent->id
            && $parent->ContentType eq 'mulitpart/alternative';
    }
    return 0;
}

sub extract_html {
    my $attachment = shift;
    my $text = $attachment->Content;
    return undef unless defined $text && length($text);
# TODO: html -> text
    return \$text;
}

sub goto_specific {
    my %args = (@_);

    my $func = (caller(1))[3];
    $func =~ s/.*:://;
    my $call = $func ."_". lc $args{'suffix'};
    unless ( defined &$call ) {
        return undef unless $args{'error'};
        require Carp; Carp::croak( $args{'error'} );
    }
    @_ = @{ $args{'arguments'} };
    goto &$call;
}


# helper functions
sub debug    { print @_, "\n" if $OPT{debug}; 1 }
sub error    { $RT::Logger->error(_(@_)); 1 }
sub warning  { $RT::Logger->warn(_(@_)); 1 }

=head1 NAME

rt-fulltext-indexer - Indexer for full text search

=head1 DESCRIPTION

This is a helper script to keep full text indexes in sync with data.
Read F<docs/full_text_indexing.pod> for complete details on how and when
to run it.

=head1 AUTHOR

Ruslan Zakirov E<lt>ruz@bestpractical.comE<gt>,
Alex Vandiver E<lt>alexmv@bestpractical.comE<gt>

=cut

__DATA__
