#!/usr/bin/python2.4

"""
Demo of linear-time code similarity calculation.

Author: Andy Chu (andychu at google dot com)
"""

import optparse
import os
import md5
import re
import sys
import time
from pprint import pprint


class MultiTimer:
  """To give a general idea of how long each step takes."""

  def __init__(self, loud=False):
    self.timestamps = []
    self.descriptions = []

  def checkpoint(self, description=''):
    print description
    self.timestamps.append(time.time())
    self.descriptions.append(description)

  def report(self):
    print "------"
    print "Report"
    print "------"

    # Compute time differences
    format = "%-60s %.3fs"
    for i in xrange(len(self.timestamps)-1):
      interval = self.timestamps[i+1] - self.timestamps[i]
      print format % (self.descriptions[i], interval)
    print format % ('Total time', self.timestamps[-1] - self.timestamps[0])


def IndexFiles(files, line_threshold):
  """
  Given a list of filenames, produce an inverted index.
  
  Any lines which occur in "line_threshold" files are thrown out.

  Also, for similarity scoring, keep track of the number of important lines in
  each file.  Files are treated as (unordered) *sets* of (unique) lines.
  Unimportant lines are duplicates and lines which exceed line_threshold.
  """
  # { line -> list of filenames the line appears in }
  index = {}  
  # { filename -> number of unique lines in the file }
  sizes = {}

  for filename in files:
    try:
      f = file(filename)
    except IOError, e:
      print e, filename
      sys.exit(1)

    for line in f:
      line = line.strip()  # Could remove *all* whitespace here
      if line:  # Skip blank lines
        if line in index:
          filelist = index[line]
          # Stop keeping track once we reach the threshold
          if len(filelist) == line_threshold + 1:
            continue
          # Only count the first occurrence of the line in the file
          if filelist[-1] != filename:
            filelist.append(filename)
            sizes[filename] = sizes.get(filename, 0) + 1
        else:
          index[line] = [filename]
          sizes[filename] = sizes.get(filename, 0) + 1
    f.close()

  # Now remove any lines that hit the threshold from the index, and adjust the
  # file sizes.
  to_remove = []
  for line, filelist in index.iteritems():
    if len(filelist) == line_threshold + 1:
      to_remove.append(line)
      for f in filelist:
        sizes[f] -= 1
  for line in to_remove:
    del index[line]

  return index, sizes


def FindSimilarPairs(set1, set2, reportfile, line_threshold):
  """Calculates pairwise similarity between two sets of files, using no other
  information but the contents of the files (i.e. no filename information).

  Args:
    set1, set2: The 2 lists of file system paths to compare.
    reportfile: name of the output text file
    line_threshold:
      We prune the index of entries that occur more than this number of times.
      For example, the line "} else {" may occur very frequently in the code,
      but we just throw it out, since the fact that 2 files have this line in
      common is not very meaningful.
      
      Making line_threshold a constant also makes the algorithm linear.  In
      practice, with real data sets, the results should be quite stable with
      respect to this parameter.  That is, they should not vary very much at
      all if line_threshold = 1 or 100 or 1000.
  """
  mt = MultiTimer()

  print '%s * %s = %s total possible pairs of files' % (
      len(set1), len(set2), len(set1) * len(set2))

  #
  # 1. Generate inverted indices and size information for each set.
  #
  mt.checkpoint("Indexing the first set of %d files (threshold=%d)" % (
    len(set1), line_threshold))
  index1, sizes1 = IndexFiles(set1, line_threshold)

  mt.checkpoint("Indexing the second set of %d files (threshold=%d)" % (
    len(set2), line_threshold))
  index2, sizes2 = IndexFiles(set2, line_threshold)

  print "Sum of file sizes in first set: %s lines" % sum(sizes1.values())
  print "Sum of file sizes in second set: %s lines" % sum(sizes2.values())
  print "Size of index over first set:", len(index1)
  print "Size of index over second set:", len(index2)

  #
  # 2. Combine the 2 indices to form sparse matrix that counts common lines.
  #
  # Pairs which do not appear have an implicit count of 0.
  #
  # This is a sparse matrix represented as a dictionary of 2-tuples:
  # (filename # in set1, filename in set2) -> number of (unique) lines that
  # they have in common

  mt.checkpoint("Computing union of lines in the indices")
  all_lines = set(index1) | set(index2)
  print 'Total unique lines in both indices: %s' % len(all_lines)

  mt.checkpoint("Making sparse common line matrix")

  matrix = {}
  for line in all_lines:
    files1 = index1.get(line)
    files2 = index2.get(line)
    if files1 and files2:
      # For every pair of files that contain this line, increment the
      # corresponding cell in the common line matrix.  
      #
      # Since we pruned the index, this whole double loop is constant time.  If
      # the line_threshold is 1 (default), then the whole thing is just a
      # single iteration.  
      for f1 in files1:
        for f2 in files2:
          matrix[(f1, f2)] = matrix.get((f1, f2), 0) + 1

  mt.checkpoint("Calculating similarity for %s pairs of files" % len(matrix))

  #
  # 3. Calculate the similarity of each cell in the matrix.
  #
  # The similarity metric is number of common lines divided by the file size
  # (and take the greater of the 2).  Note that this means that if file1 is
  # entirely *contained* in file2, then the similarity of the pair will be 1.0.
  # This is desirable since you can detect if 99,000 lines were added to a 1,000
  # line file, etc.  Other similarity metrics are possible.
  #
  similar_pairs = []
  for (file1, file2), num_common in matrix.iteritems():
    c = float(num_common)
    # TODO: Write notes on similarity metric
    similarity = max(c / sizes1[file1], c / sizes2[file2])
    similar_pairs.append((file1, file2, num_common, similarity))

  mt.checkpoint("Sorting %d similar pairs" % len(similar_pairs))
  similar_pairs.sort(key=lambda x: x[3], reverse=True)

  mt.checkpoint("Writing similarity report")
  f = open(reportfile, 'w')
  for file1, file2, num_common, similarity in similar_pairs:
    # Put a * after entries where the relative paths are *not* the same, just
    # for convenience when looking at the report.
    if file1.split('/')[1:] != file2.split('/')[1:]:
      mark = '*'
    else:
      mark = ''

    print >> f, '(%4d) %-40s (%4d) %-40s %4d %.3f %s' % (
        sizes1[file1], file1, sizes2[file2], file2, num_common, similarity,
        mark)
  f.close()
  print 'Wrote %s' % reportfile

  mt.checkpoint("End")
  mt.report()


if __name__ == '__main__':
  parser = optparse.OptionParser()

  parser.add_option(
      '-l', '--line-threshold', dest='line_threshold', type='int', default=1,
      help='ignore lines that occur more than this number of times in either '
      'set')
  parser.add_option(
      '-o', '--out', dest='reportfile', default=None,
      help='Write output to this file instead of the default.')

  (options, argv) = parser.parse_args()

  try:
    left, right = argv[:2]
  except ValueError:
    print 'Pass 2 files containing paths as arguments (left tree, right tree).'
    sys.exit(1)

  if not options.reportfile:
    options.reportfile = 'out-%s-vs-%s.%s.txt' % (
        os.path.splitext(os.path.basename(left))[0],
        os.path.splitext(os.path.basename(right))[0],
        options.line_threshold)

  set1 = [f.strip() for f in open(left).readlines()]
  set2 = [f.strip() for f in open(right).readlines()]
  FindSimilarPairs(set1, set2, options.reportfile, options.line_threshold)