#!/usr/bin/env python

"""
    logfinder.py -- finds log-like files

    Copyright (C) 2005 Electronic Frontier Foundation

    This program is free software; you can redistribute it and/or modify
    it under the terms of the GNU General Public License as published by
    the Free Software Foundation; either version 2 of the License, or
    (at your option) any later version.

    This program is distributed in the hope that it will be useful,
    but WITHOUT ANY WARRANTY; without even the implied warranty of
    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
    GNU General Public License for more details.

    You should have received a copy of the GNU General Public License
    along with this program; if not, write to the Free Software
    Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.

Written by Seth Schoen.

"""


import os, sys, re, getopt

# Probe for the optional decompression modules.  bz_module / gz_module are
# 1 when the corresponding module was imported and 0 otherwise; bz_open()
# and gz_open() consult them before touching the module.
# Only ImportError is treated as "module unavailable": a bare except would
# also swallow KeyboardInterrupt and SystemExit raised during the import.
try:
	import bz2
	bz_module = 1
except ImportError:
	bz_module = 0

try:
	import gzip
	gz_module = 1
except ImportError:
	gz_module = 0

def bz_open(filename):
	"""Open filename as a bz2-compressed file and return the file object.

	filename -- path of the compressed file.

	Raises OSError if the bz2 module could not be imported when this
	program started (bz_module is 0).
	"""
	if bz_module:
		return bz2.BZ2File(filename)
	# Call-style raise is valid on both Python 2 and 3; the trailing
	# space keeps the filename from running into the word "for".
	raise OSError("missing bz2 decompressor module for " + filename)
	# in principle, on POSIX systems, we could spawn a bzcat
	# process if the bz2 module is unavailable; however, it
	# seems difficult to be able to check the return code
	# (to find out whether it worked) while preserving the
	# ability to cut off the decompression operation after
	# a certain number of bytes and avoid decompressing the
	# entire file.

def gz_open(filename):
	"""Open filename as a gzip-compressed file and return the file object.

	filename -- path of the compressed file.

	Raises OSError if the gzip module could not be imported when this
	program started (gz_module is 0).
	"""
	if gz_module:
		return gzip.GzipFile(filename)
	# Call-style raise is valid on both Python 2 and 3; the trailing
	# space keeps the filename from running into the word "for".
	raise OSError("missing gzip decompressor module for " + filename)

# This set of regexes seems to have a lot of false positives, probably
# because of the ease with which the time rule ([0-9]:[0-9][0-9]) can
# match binary files.  Is there a way to tell reliably whether a file is
# a text file or a binary file?  How does file(1) know?

# Chris says: One approach is to look for matches on successive lines, which
# the Perl-style regular expressions can do.  He also says: You can look for
# zero bytes within the file in order to indicate whether it is likely to be
# a binary file.

# preliminary definitions:

# Table of (compiled pattern, label) pairs.  Each line of a scanned file is
# tested against every pattern; the label of each pattern that matches is
# what gets collected and reported.  Order matters only for the order in
# which labels are collected.
searches = [
	# four digits, dash, two digits, dash, two digits
	(re.compile(r"([0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9])"), "iso-date"),
	# one digit, colon, two digits (very permissive)
	(re.compile(r"([0-9]:[0-9][0-9])"), "time"),
	# digit, any one character, English month abbreviation
	(re.compile(r"[0-9].(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec)"), "date"),
	# English month abbreviation, any one character, digit
	(re.compile(r"(Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec).[0-9]"), "date"),
	# four dot-separated groups of one to three digits
	(re.compile(r"([12]?[0-9]?[0-9]\.[12]?[0-9]?[0-9]\.[12]?[0-9]?[0-9]\.[12]?[0-9]?[0-9])"), "ipv4-address"),
	# digits grouped 3-2-4 with dashes
	(re.compile(r"[0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9][0-9][0-9]"), "ssn"),
	# lowercase alphanumerics, "@", lowercase alphanumerics, ".", more of the same
	(re.compile(r"[a-z0-9]+@[a-z0-9]+\.[a-z0-9]+"), "e-mail-address"),
]
# I feel guilty about this because it isn't a complete regular expression
# for e-mail addresses.  For example, Ian Goldberg has the valid e-mail
# address <n@ai>, which this regular expression fails to match.  We should
# replace this with a strictly correct regular expression; fully qualified
# domain names do not have to contain period characters at all.

# try to add on locale-specific dates (matching dates like Fev/07, 07 Fev,
# etc.) -- do we need to use the locale's date-formatting strings instead?
# will other software actually be using locale-provided date format strings,
# or merely locale-provided day and month names?

# Add patterns for the abbreviated month names of the user's locale, when
# that locale is something other than "C".  This is best-effort: any
# failure (no locale support, no nl_langinfo on this platform, an
# unsupported locale setting) is reported on stderr and the program
# carries on with the English-only patterns.
try:
	import locale
	if locale.setlocale(locale.LC_ALL, '') != 'C':
		months = [
			locale.ABMON_1, locale.ABMON_2, locale.ABMON_3,
			locale.ABMON_4, locale.ABMON_5, locale.ABMON_6,
			locale.ABMON_7, locale.ABMON_8, locale.ABMON_9,
			locale.ABMON_10, locale.ABMON_11, locale.ABMON_12,
		]
		# Escape each name so that characters such as "." in an
		# abbreviation are matched literally, and drop empty names:
		# an empty alternative would make the pattern match any digit.
		local_month_names_re = "|".join(map(re.escape, filter(None, map(locale.nl_langinfo, months))))
		if local_month_names_re:
			searches += [(re.compile(r"[0-9].(" + local_month_names_re + r")"), "localized-date")]
			searches += [(re.compile(r"(" + local_month_names_re + r").[0-9]"), "localized-date")]
except Exception as e:
	# Deliberately broad: locale support is optional.  Diagnostics go to
	# stderr, like the other error reports in this file, so they do not
	# mix with the scan results on stdout.
	sys.stderr.write(str(e) + "\n")

def usage():
	"""Write the caveat banner and the command-line synopsis to stdout.

	The synopsis is prefixed with sys.argv[0], followed by a space so the
	program name does not run into the first option.
	"""
	sys.stdout.write("""
***********************************************************************
 Please consider the limitations of this program, which is not able to
 find every possible kind of log file or identify every potential data
 retention concern.  See README for more information.

 For more information about data retention, please consult EFF's
 resources for on-line service providers at <http://www.eff.org/osp/>.
***********************************************************************

Usage:

""" + sys.argv[0] + """ [-w] [-l lines] [-c] [-h | --help] [path] [path] [...]

  With -c or no path specified, look for current logging activity in
  open files systemwide.

  With a path or paths specified, look for log-like text in files within
  the specified path or paths.  By default, look at the first 100 lines
  of such files; if -l is specified, look at the specified number of
  lines instead; if -w is specified, look at the whole file.

  For maximum coverage, specify the path "/".

""")

def get_fsize(f):
	"""Return the size in bytes of the file at path f.

	If the file cannot be stat'ed, the error is written to stderr and
	None is returned, so one unreadable file does not abort a scan.
	"""
	try:
		return os.stat(f).st_size
	except Exception as e:
		# Best-effort by design: report and continue.
		sys.stderr.write(str(e) + "\n")
		return None

def uniq(L):
	"""Return a new sorted list of the distinct items in L.

	L -- any iterable of mutually comparable items.  It is not
	modified: the sort is done on a copy rather than in place.
	"""
	o = []
	for i in sorted(L):
		# Compare against the last item kept rather than a None
		# sentinel, so that a leading None value is not dropped.
		if not o or i != o[-1]:
			o.append(i)
	return o

def plausible(m):
	""" Decide whether the match labels collected for a file are
	enough to justify reporting that file. """
	# The rule is deliberately simple and could be made smarter: the
	# file counts as loglike as soon as some label occurs more than
	# once in m, i.e. m is longer than its de-duplicated form.
	distinct = uniq(m)
	return len(distinct) < len(m)

# Not sure you need this. It's slow and bloaty to compile a complete
# list of files, and then examine them. Instead, just attach your
# callback to os.path.walk, and do everything in there, thus
# incrementally searching the filesystem. Faster, smaller, simpler.
class FileSet:
	"""A collection of files to consider.

	self.files maps (directory, filename) tuples to the size recorded
	when the file was added (None if it could not be determined), so
	that find_changes() can later report which files have grown.
	Constructing a FileSet immediately records the files lsof lists.
	"""
	def __init__(self, files = None):
		"""Create the set and populate it from lsof.

		files -- optional existing {(path, filename): size} dict to
		use (and extend) as the backing store.  When omitted, a fresh
		dict is created per instance; a mutable default would be
		shared by every FileSet built without an argument.
		"""
		if files is None:
			files = {}
		self.files = files
		self.capture_open_files()

	def add_file(self, path, filename):
		""" Store file and its size (could be extended to store
		other information). """
		self.files[(path, filename)] = get_fsize(os.path.join(path,filename))

	def add_local_files(self, path, files):
		""" Store many files and their sizes, assuming they are
		all in the current directory: each size is looked up by the
		bare filename, while path is only used for the key."""
		for f in files:
			self.files[(path, f)] = get_fsize(f)

	def path_walk_callback(self, junk, dirname, files):
		"""Callback function for os.path.walk; junk is ignored."""
		for f in files: self.add_file(dirname, f)

	def capture_open_files(self):
		"""Which files are currently open on the system?

		Runs lsof and records every listed file whose fifth field is
		REG or VREG, using the last whitespace-separated field of the
		line as the file's path."""
		# If lsof is installed setuid root and so configured, it
		# will be willing to list files belonging to other users.
		# This needs a way to detect whether lsof will list files
		# belonging to other users, if we aren't root.  Interestingly,
		# lsof --help | tail -1 should reveal this information.
		# I don't know how lsof behaves on Windows, however.
		pipe = os.popen("lsof")
		try:
			for line in pipe:
				fields = line.split()
				# Lines with fewer than five fields have no
				# field 4 to test; skip them rather than
				# raising IndexError.
				if len(fields) < 5 or fields[4] not in ("REG", "VREG"):
					continue
				path, filename = os.path.split(fields[-1])
				self.add_file(path, filename)
		finally:
			pipe.close()

	def __len__(self):
		"""How many files are currently in this FileSet?"""
		return len(self.files)

	def find_changes(self):
		"""Which files known to this FileSet object have undergone
		interesting changes?  Currently, interesting changes are
		defined as an increase in file size, and, if this is
		present, contents that match a regular expression indicating
		loglikeness.  This could be extended with other tests.

		Writes one line per grown file to stdout: the path, the word
		"grew", and, when the contents look loglike, the distinct
		labels that matched."""
		for (path, filename), old_size in list(self.files.items()):
			full_name = os.path.join(path, filename)
			new_size = get_fsize(full_name)
			# A size of None means the stat failed; without both
			# sizes there is nothing to compare.
			if old_size is None or new_size is None:
				continue
			if new_size <= old_size:
				continue
			report = full_name + " grew"
			m = search_file(full_name)
			if m and plausible(m):
				report += " " + " ".join(uniq(m))
			# Always terminate the line, so reports for different
			# files never run together.
			sys.stdout.write(report + "\n")

def search_file(s, offset=0, matches_needed=1, lines=100):
	""" Search for loglike text in the readable file s, starting at
	offset offset.

	s -- path of the file; names ending in ".gz" or ".bz2" are opened
	     through gz_open / bz_open, everything else directly.
	offset -- position passed to seek() before reading.
	matches_needed -- no longer used; kept for compatibility.
	lines -- stop after this many lines.  The counter is incremented
	     before it is compared, so 0 (or a negative value) never
	     matches and the whole file is read.

	Returns a list with one label from `searches` for every
	(line, pattern) pair that matched, in the order found; the list is
	empty when nothing matched.  Returns None if the file could not be
	opened or positioned.  If reading fails part-way (for example a
	corrupt compressed file), the error is written to stderr and the
	labels collected so far are returned."""
	f = None
	try:
		if s[-3:] == ".gz":
			f = gz_open(s)
		elif s[-4:] == ".bz2":
			f = bz_open(s)
		else:
			# Binary mode: scanned files may not be text at all.
			f = open(s, "rb")
		f.seek(offset)
	except Exception as e:
		sys.stderr.write(str(e) + "\n")
		if f is not None:
			f.close()
		return None
	count = 0
	matched = []
	try:
		for L in f:
			if not isinstance(L, str):
				# Python 3 yields bytes here; latin-1 maps
				# every byte to a character, so it cannot fail.
				L = L.decode("latin-1")
			for pattern, label in searches:
				if pattern.search(L):
					matched.append(label)
					# possibly a "continue" belongs here
					# to avoid multiple matches on the same line
			count += 1
			if count == lines:
				break
	except Exception as e:
		# Deliberately broad: the decompressors raise several
		# unrelated exception types on bad data, and one bad file
		# should not abort the whole scan.
		sys.stderr.write(str(e) + "\n")
	finally:
		f.close()
	return matched

def grep_callback(junk, dirname, files):
	"""Directory-walk callback: report the loglike files in one directory.

	junk -- the extra argument supplied by the walker.  None means
	        "use search_file's default line limit"; any other value
	        is passed to search_file as its `lines` limit.
	dirname -- directory being visited.
	files -- names of the entries in dirname; only those that are
	        regular files are examined.

	For every file judged plausible, writes the path followed by the
	distinct matching labels to stdout, one file per line.
	"""
	file_list = [os.path.join(dirname, f) for f in files]
	for f in filter(os.path.isfile, file_list):
		# Start at beginning of file, and require _multiple_ lines
		# containing matches.
		if junk is None:
			m = search_file(f)
		else:
			m = search_file(f, lines=junk)
		if m and plausible(m):
			sys.stdout.write(f + " " + " ".join(uniq(m)) + "\n")

def pause():
	"""Block until the user presses Enter.

	If standard input is closed or at end-of-file, return immediately
	instead of failing with a traceback.
	"""
	# This should be a user-configurable time delay later.
	try:
		read_line = raw_input
	except NameError:
		# Python 3 renamed raw_input to input.
		read_line = input
	try:
		read_line("Press Enter to continue. ")
	except EOFError:
		# Nothing to wait for; finish the prompt line and carry on.
		sys.stdout.write("\n")

def mainloop():
	"""Command-line entry point: parse options, run the scans, exit.

	Options (see usage()):
	  -w          examine whole files (line limit 0)
	  -l lines    examine this many lines of each file
	  -c          watch currently open files for growth
	  -h, -?, --help   print usage and exit

	With no paths given, -c is implied.  Exits with status 0 on
	completion or after printing help, and with status 2 when the
	command line cannot be parsed.
	"""
	try:
		opts, args = getopt.getopt(sys.argv[1:], "wl:ch?", ["help"])
	except getopt.GetoptError as e:
		sys.stderr.write(str(e) + "\n")
		usage()
		sys.exit(2)
	# default value
	lines = 100
	current = 0
	for opt_name, opt_value in opts:
		if opt_name == "-w":
			lines = 0
		elif opt_name == "-l":
			try:
				# Assignment, not comparison: "==" here would
				# silently discard the requested limit.
				lines = int(opt_value)
			except ValueError:
				sys.stderr.write("option -l requires an integer, not %r\n" % (opt_value,))
				usage()
				sys.exit(2)
		elif opt_name == "-c":
			current = 1
		elif opt_name in ["-?", "-h", "--help"]:
			usage()
			sys.exit(0)
		else:
			# probably not reached
			usage()
			sys.exit(0)
	if not args:
		# default if no paths are specified
		current = 1
	# only give this warning if attempting to use lsof; os.geteuid
	# does not exist on every platform, so check before calling it
	if current and hasattr(os, "geteuid") and os.geteuid():
		sys.stderr.write("""
*** WARNING: POSIX user ID is not root!

This means some files may be unreadable.  It is normally appropriate to run
this program with root privilege.
""")
	# Insert code at the beginning that can find lsof no matter where it is,
	# or the OS-specific equivalent (fstat on OBSD, for example; or whatever
	# Windows uses).
	if current and os.system("lsof -n >/dev/null 2>&1"):
		sys.stderr.write("Could not invoke lsof; current logging activity will not be detected.\n")
		current = 0

	# New logic here:

	# (1) current if requested
	if current:
		sys.stdout.write("Scanning for open files systemwide...\n")
		FS = FileSet()
		sys.stdout.write("Scanned sizes of %d files.\n" % len(FS))
		sys.stdout.write("Waiting for log activity; please allow time to elapse.\n")
		pause()
		FS.find_changes()

	# (2) then do each tree
	# os.walk is used instead of os.path.walk, which no longer exists in
	# Python 3.  The line limit travels in the callback's first argument.
	for i in args:
		for dirname, subdirs, files in os.walk(i):
			grep_callback(lines, dirname, files)

	# (3) then we're done
	sys.exit(0)

# BEGIN MAIN PROGRAM

# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
	mainloop()
