#!/usr/local/bin/python2.2 -O
"""check HTML pages for broken links"""
# Copyright (C) 2000-2002  Bastian Kleineidam
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

# imports and checks
import sys
# Abort before importing anything else when the interpreter has no
# sys.version_info attribute or is older than Python 2.1 final.
if not hasattr(sys, 'version_info') or sys.version_info<(2, 1, 0, 'final', 0):
    raise SystemExit, "This program requires Python 2.1 or later."

import getopt, re, os, urlparse, pprint, linkcheck
import linkcheck.timeoutsocket
# set default 30 seconds timeout
# NOTE(review): the --timeout entry of the Usage text below documents a
# default of 10 seconds; confirm which value is intended.
linkcheck.timeoutsocket.setDefaultSocketTimeout(30)
# import several helper debugging things
# (presumably the source of the debug level names BRING_IT_ON and
# HURT_ME_PLENTY used further down; they are not defined in this file)
from linkcheck.debuglevels import *
from linkcheck import StringUtil
# module-level shortcut for the package's debug output function
debug = linkcheck.debug

# main usage text
# The text is passed through linkcheck._() and afterwards %-formatted with
# linkcheck.log.LoggerKeys, which fills the single %s placeholder in the
# -o/--output entry. Any literal percent sign added to this text therefore
# has to be written as %%.
# NOTE(review): two documented defaults differ from the code in this file:
# the password default is given as 'joe@' while the script initializes
# _password to "guest@", and the timeout default is given as 10 seconds
# while the script sets 30 seconds at import time. Confirm which side is
# authoritative before changing either.
Usage = linkcheck._("""USAGE\tlinkchecker [options] file-or-url...

OPTIONS
For single-letter option arguments the space is not a necessity. So
'-o colored' is the same as '-ocolored'.
-a, --anchors
        Check HTTP anchor references. Default is don't check anchors.
-C, --cookies
        Accept and send HTTP cookies according to RFC 2109. Only cookies
        which are sent back to the originating server are accepted.
        Sent and accepted cookies are provided as additional logging
        information.
-d, --denyallow
        Swap checking order to extern/intern. Default checking order
        is intern/extern.
-D, --debug
        Print debugging information. Provide this option multiple times
        for even more debugging information.
-e regex, --extern=regex
        Assume urls that match the given expression as extern.
        Only intern HTML links are checked recursively.
-f file, --config=file
        Use file as configuration file. As default LinkChecker first
        searches /etc/linkcheckerrc and then ~/.linkcheckerrc
        (under Windows <path-to-program>\\linkcheckerrc).
-F type[/filename], --file-output=type[/filename]
        Same as -o, but write to a file linkchecker-out.<type>
        or <filename> if specified. If the file already exists, it
        is overwritten. You can specify this option more than once.
        There is no file output for the blacklist logger. Default is
        no file output.
-I, --interactive
        Ask for url if none are given on the commandline.
-i regex, --intern=regex
        Assume URLs that match the given expression as intern.
        LinkChecker descends recursively only to intern URLs, not to extern.
-h, --help
        Help me! Print usage information for this program.
-N server, --nntp-server=server
        Specify an NNTP server for 'news:...' links. Default is the
        environment variable NNTP_SERVER. If no host is given,
        only the syntax of the link is checked.
-o type, --output=type
        Specify output type as %s.
        Default type is text.
-p pwd, --password=pwd
        Try password pwd for HTTP and FTP authorization.
        Default password is 'joe@'. See also -u.
-P secs, --pause=secs
        Pause <secs> seconds between each url check. This option
        implies -t0.
        Default is no pause between requests.
-q, --quiet
        Quiet operation. This is only useful with -F.
-r depth, --recursion-level=depth
        Check recursively all links up to given depth (depth >= 0).
        Default depth is 1.
-s, --strict
        Check only syntax of extern links, do not try to connect to them.
-t num, --threads=num
        Generate no more than num threads. Default number of threads is 5.
        To disable threading specify a non-positive number.
--timeout=secs
        Set the timeout for TCP connection attempts in seconds. The default
        timeout is 10 seconds.
-u name, --user=name
        Try username name for HTTP and FTP authorization.
        Default is 'anonymous'. See also -p.
-V, --version
        Print version and exit.
-v, --verbose
        Log all checked URLs (implies -w). Default is to log only invalid
        URLs.
-w, --warnings
        Log warnings.
-W regex, --warning-regex=regex
        Define a regular expression which prints a warning if it matches
        any content of the checked link.
        This applies of course only to pages which are valid, so we can
        get their content.
        Use this to check for pages that contain some form of error
        message, for example 'This page has moved' or 'Oracle
        Application Server error'.
        This option implies -w.
""") % linkcheck.log.LoggerKeys

# general remarks; printHelp() shows this text after the usage text.
# The literal is passed through linkcheck._() (presumably the translation
# lookup, defined outside this file), so editing the wording changes the
# string handed to that function.
Notes = linkcheck._("""NOTES
o LinkChecker assumes an http:// resp. ftp:// link when a commandline URL
  starts with 'www.' resp. 'ftp.'
  You can also give local files as arguments.
o If you have your system configured to automatically establish a
  connection to the internet (e.g. with diald), it will connect when
  checking links not pointing to your local host.
  Use the -s and -i options to prevent this.
o Javascript links are currently ignored.
o If your platform does not support threading, LinkChecker uses -t0.
o You can supply multiple user/password pairs in a configuration file.
o To use proxies set $http_proxy, $https_proxy, $ftp_proxy, $gopher_proxy
  on Unix or Windows.
  On a Mac use the Internet Config.
o When checking 'news:' links the given NNTP host doesn't need to be the
  same as the host of the user browsing your pages!
""")

# example invocations; printHelp() shows this text last.
# This is not a raw string: each doubled backslash below yields a single
# backslash in the displayed text.
Examples = linkcheck._("""EXAMPLES
o linkchecker -v -ohtml -r2 -s -itreasure.calvinsplayground.de \\
    http://treasure.calvinsplayground.de/~calvin/ > sample.html
o Local files and syntactic sugar on the command line:
      linkchecker c:\\temp\\test.html
      linkchecker ../bla.html
      linkchecker www.myhomepage.de
      linkchecker -r0 ftp.linux.org
""")

def printVersion ():
    """print the program version and exit"""
    print linkcheck.Config.AppInfo
    sys.exit(0)

def printHelp ():
    """print the program help text"""
    if os.name!='posix':
        StringUtil.paginate(Usage+"\n"+Notes+"\n"+Examples)
    else:
        print Usage
	print Notes
	print Examples
    sys.exit(0)

def printUsage (msg):
    """report a command line error on stderr and exit with status 1

    msg is interpolated into the "Error: %s" line, which is followed by
    a hint to run 'linkchecker -h'.
    """
    emit = sys.stderr.write
    emit(linkcheck._("Error: %s\n") % msg)
    emit(linkcheck._("Execute 'linkchecker -h' for help\n"))
    raise SystemExit(1)


# Read command line arguments
# short options; a trailing colon marks an option that takes an argument
_shortopts = "adCDe:f:F:hIi:N:o:p:P:qr:Rst:u:VvwW:"
# long options; a trailing equals sign marks an option that takes an argument
_longopts = [
    "anchors",
    "config=",
    "cookies",
    "debug",
    "extern=",
    "file-output=",
    "nntp-server=",
    "help",
    "interactive",
    "intern=",
    "denyallow",
    "output=",
    "password=",
    "pause=",
    "quiet",
    "recursion-level=",
    "wischiwaschi",
    "robots-txt",
    "strict",
    "threads=",
    "timeout=",
    "user=",
    "version",
    "verbose",
    "warnings",
    "warning-regex=",
]
try:
    # Note: cut out the name of the script
    options, args = getopt.getopt(sys.argv[1:], _shortopts, _longopts)
except getopt.error:
    # hand the exception value to printUsage(), which reports it and exits
    printUsage(sys.exc_info()[1])

# set debug level as early as possible:
# every occurrence of -D/--debug raises the level by one
for opt, arg in options:
    if opt in ("-D", "--debug"):
        linkcheck.Config.DebugLevel += 1
debug(BRING_IT_ON, "Python", sys.version, "on", sys.platform)
# read configuration from config files; each -f/--config argument is
# passed on in command line order
config = linkcheck.Config.Configuration()
configfiles = []
for opt, arg in options:
    if opt in ("-f", "--config"):
        configfiles.append(arg)
config.read(configfiles)
# disable threading for debugging
if linkcheck.Config.DebugLevel > 0:
    config.disableThreading()
# apply commandline options and arguments

def _illegalArgument (arg, optnames):
    """abort with the usage error for an illegal option argument

    arg is the rejected argument, optnames the quoted option names that
    are appended to the message, e.g. "'-P, --pause'". Does not return,
    since printUsage() exits.
    """
    printUsage((linkcheck._("Illegal argument '%s' for option ") % arg) +
               optnames)

def _intArgument (arg, optnames, minimum=None):
    """return arg converted to an integer

    Aborts with a usage error instead of a traceback when arg is not an
    integer, or when minimum is given and the value is smaller than it.
    """
    try:
        num = int(arg)
    except ValueError:
        _illegalArgument(arg, optnames)
    if minimum is not None and num < minimum:
        _illegalArgument(arg, optnames)
    return num

# credentials used when only one of -u / -p is given
# NOTE(review): the Usage text documents 'joe@' as the default password;
# confirm which value is intended.
_user = "anonymous"
_password = "guest@"
# set as soon as -u or -p is seen; triggers the catch-all entry below
constructauth = 0
for opt,arg in options:
    if opt=="-a" or opt=="--anchors":
        config["anchors"] = 1

    elif opt=="-e" or opt=="--extern":
        config["externlinks"].append(linkcheck.getLinkPat(arg))

    elif opt=="-h" or opt=="--help":
        printHelp()

    elif opt=="-o" or opt=="--output":
        if linkcheck.log.Loggers.has_key(arg):
            config['log'] = config.newLogger(arg)
        else:
            _illegalArgument(arg, "'-o, --output'")

    elif opt=="-F" or opt=="--file-output":
        # the argument is "type" or "type/filename"; an empty filename part
        # leaves the whole argument as the type, which is then rejected
        # as an unknown logger
        ns = {'fileoutput':1}
        logtype = arg
        if '/' in arg:
            prefix, filename = arg.split('/', 1)
            if filename:
                logtype = prefix
                ns['filename'] = filename
        if linkcheck.log.Loggers.has_key(logtype) and logtype != "blacklist":
            config['fileoutput'].append(config.newLogger(logtype, ns))
        else:
            _illegalArgument(arg, "'-F, --file-output'")

    elif opt=="-I" or opt=="--interactive":
        config['interactive'] = 1

    elif opt=="-i" or opt=="--intern":
        config["internlinks"].append(linkcheck.getLinkPat(arg))

    # the short option registered with getopt and documented in the usage
    # text is -d
    elif opt=="-d" or opt=="--denyallow":
        config["denyallow"] = 1

    elif opt=="-N" or opt=="--nntp-server":
        config["nntpserver"] = arg

    elif opt=="-p" or opt=="--password":
        _password = arg
        constructauth = 1

    elif opt=="-P" or opt=="--pause":
        config["wait"] = _intArgument(arg, "'-P, --pause'", 0)

    elif opt=="-q" or opt=="--quiet":
        config["quiet"] = 1

    elif opt=="-r" or opt=="--recursion-level":
        config["recursionlevel"] = _intArgument(arg,
                                                "'-r, --recursion-level'", 0)

    # robots.txt is now default, so ignore this option
    elif opt=="-R" or opt=="--robots-txt": pass

    elif opt=="-s" or opt=="--strict":
        config["strict"] = 1

    elif opt=="-t" or opt=="--threads":
        num = _intArgument(arg, "'-t, --threads'")
        # leave the threading setup alone when threads are already
        # switched off in the configuration or debugging is active
        if config["threads"] and not linkcheck.Config.DebugLevel:
            if num>1:
                config.enableThreading(num)
            else:
                config.disableThreading()

    elif opt=="--timeout":
        linkcheck.timeoutsocket.setDefaultSocketTimeout(
            _intArgument(arg, "'--timeout'"))

    elif opt=="-u" or opt=="--user":
        _user = arg
        constructauth = 1

    elif opt=="-V" or opt=="--version":
        printVersion()

    elif opt=="-v" or opt=="--verbose":
        config["verbose"] = 1
        config["warnings"] = 1

    elif opt=="--wischiwaschi":
        from linkcheck import util1
        util1.abbuzze()
        sys.exit(0)

    elif opt=="-w" or opt=="--warnings":
        config["warnings"] = 1

    elif opt=="-W" or opt=="--warning-regex":
        # an invalid expression is a usage error, not a traceback
        try:
            config["warningregex"] = re.compile(arg)
        except re.error:
            _illegalArgument(arg, "'-W, --warning-regex'")
        config["warnings"] = 1

    elif opt=="-C" or opt=="--cookies":
        config['cookies'] = 1

if constructauth:
    # put a match-everything entry in front of the configured entries
    config["authentication"].insert(0, {'pattern': re.compile(".*"),
                                        'user': _user,
                                        'password': _password})

# construct the url list
# if we use blacklist mode and the logger's file (config['log'].filename)
# exists, its lines replace the urls given on the commandline
from linkcheck.log.BlacklistLogger import BlacklistLogger
if config["log"].__class__ == BlacklistLogger and \
   os.path.exists(config['log'].filename):
    blacklist = open(config['log'].filename)
    try:
        args = blacklist.readlines()
    finally:
        # close explicitly instead of relying on garbage collection
        blacklist.close()

debug(HURT_ME_PLENTY, "configuration:", pprint.pformat(config.data))

# interactive input: without any url either ask for some or warn
if not args:
    if config['interactive']:
        urls = raw_input(linkcheck._("enter one or more urls, separated by white-space\n--> "))
        args = urls.split()
    else:
        config.warn(linkcheck._("no files or urls given"))

# syntactic sugar: an argument without any colon that starts with "ftp."
# or "www." gets the matching scheme prepended
from linkcheck import UrlData
for url in args:
    url = url.strip()
    if ":" not in url:
        for prefix, scheme in (("ftp.", "ftp://"), ("www.", "http://")):
            if url.startswith(prefix):
                url = scheme+url
                break
    config.appendUrl(UrlData.GetUrlDataFrom(url, 0, config))

############################# check the urls ################################
linkcheck.checkUrls(config)
#############################################################################

# interactive input end: wait for RETURN before the script finishes
if config['interactive']:
    finish_prompt = linkcheck._("Hit RETURN to finish")
    raw_input(finish_prompt)
