#!/usr/bin/python2.4 """ Demo of linear-time code similarity calculation. Author: Andy Chu (andychu at google dot com) """ import optparse import os import md5 import re import sys import time from pprint import pprint class MultiTimer: """To give a general idea of how long each step takes.""" def __init__(self, loud=False): self.timestamps = [] self.descriptions = [] def checkpoint(self, description=''): print description self.timestamps.append(time.time()) self.descriptions.append(description) def report(self): print "------" print "Report" print "------" # Compute time differences format = "%-60s %.3fs" for i in xrange(len(self.timestamps)-1): interval = self.timestamps[i+1] - self.timestamps[i] print format % (self.descriptions[i], interval) print format % ('Total time', self.timestamps[-1] - self.timestamps[0]) def IndexFiles(files, line_threshold): """ Given a list of filenames, produce an inverted index. Any lines which occur in "line_threshold" files are thrown out. Also, for similarity scoring, keep track of the number of important lines in each file. Files are treated as (unordered) *sets* of (unique) lines. Unimportant lines are duplicates and lines which exceed line_threshold. """ # { line -> list of filenames the line appears in } index = {} # { filename -> number of unique lines in the file } sizes = {} for filename in files: try: f = file(filename) except IOError, e: print e, filename sys.exit(1) for line in f: line = line.strip() # Could remove *all* whitespace here if line: # Skip blank lines if line in index: filelist = index[line] # Stop keeping track once we reach the threshold if len(filelist) == line_threshold + 1: continue # Only count the first occurrence of the line in the file if filelist[-1] != filename: filelist.append(filename) sizes[filename] = sizes.get(filename, 0) + 1 else: index[line] = [filename] sizes[filename] = sizes.get(filename, 0) + 1 f.close() # Now remove any lines that hit the threshold from the index, and adjust the # file sizes. to_remove = [] for line, filelist in index.iteritems(): if len(filelist) == line_threshold + 1: to_remove.append(line) for f in filelist: sizes[f] -= 1 for line in to_remove: del index[line] return index, sizes def FindSimilarPairs(set1, set2, reportfile, line_threshold): """Calculates pairwise similarity between two sets of files, using no other information but the contents of the files (i.e. no filename information). Args: set1, set2: The 2 lists of file system paths to compare. reportfile: name of the output text file line_threshold: We prune the index of entries that occur more than this number of times. For example, the line "} else {" may occur very frequently in the code, but we just throw it out, since the fact that 2 files have this line in common is not very meaningful. Making line_threshold a constant also makes the algorithm linear. In practice, with real data sets, the results should be quite stable with respect to this parameter. That is, they should not vary very much at all if line_threshold = 1 or 100 or 1000. """ mt = MultiTimer() print '%s * %s = %s total possible pairs of files' % ( len(set1), len(set2), len(set1) * len(set2)) # # 1. Generate inverted indices and size information for each set. # mt.checkpoint("Indexing the first set of %d files (threshold=%d)" % ( len(set1), line_threshold)) index1, sizes1 = IndexFiles(set1, line_threshold) mt.checkpoint("Indexing the second set of %d files (threshold=%d)" % ( len(set2), line_threshold)) index2, sizes2 = IndexFiles(set2, line_threshold) print "Sum of file sizes in first set: %s lines" % sum(sizes1.values()) print "Sum of file sizes in second set: %s lines" % sum(sizes2.values()) print "Size of index over first set:", len(index1) print "Size of index over second set:", len(index2) # # 2. Combine the 2 indices to form sparse matrix that counts common lines. # # Pairs which do not appear have an implicit count of 0. # # This is a sparse matrix represented as a dictionary of 2-tuples: # (filename # in set1, filename in set2) -> number of (unique) lines that # they have in common mt.checkpoint("Computing union of lines in the indices") all_lines = set(index1) | set(index2) print 'Total unique lines in both indices: %s' % len(all_lines) mt.checkpoint("Making sparse common line matrix") matrix = {} for line in all_lines: files1 = index1.get(line) files2 = index2.get(line) if files1 and files2: # For every pair of files that contain this line, increment the # corresponding cell in the common line matrix. # # Since we pruned the index, this whole double loop is constant time. If # the line_threshold is 1 (default), then the whole thing is just a # single iteration. for f1 in files1: for f2 in files2: matrix[(f1, f2)] = matrix.get((f1, f2), 0) + 1 mt.checkpoint("Calculating similarity for %s pairs of files" % len(matrix)) # # 3. Calculate the similarity of each cell in the matrix. # # The similarity metric is number of common lines divided by the file size # (and take the greater of the 2). Note that this means that if file1 is # entirely *contained* in file2, then the similarity of the pair will be 1.0. # This is desirable since you can detect if 99,000 lines were added to a 1,000 # line file, etc. Other similarity metrics are possible. # similar_pairs = [] for (file1, file2), num_common in matrix.iteritems(): c = float(num_common) # TODO: Write notes on similarity metric similarity = max(c / sizes1[file1], c / sizes2[file2]) similar_pairs.append((file1, file2, num_common, similarity)) mt.checkpoint("Sorting %d similar pairs" % len(similar_pairs)) similar_pairs.sort(key=lambda x: x[3], reverse=True) mt.checkpoint("Writing similarity report") f = open(reportfile, 'w') for file1, file2, num_common, similarity in similar_pairs: # Put a * after entries where the relative paths are *not* the same, just # for convenience when looking at the report. if file1.split('/')[1:] != file2.split('/')[1:]: mark = '*' else: mark = '' print >> f, '(%4d) %-40s (%4d) %-40s %4d %.3f %s' % ( sizes1[file1], file1, sizes2[file2], file2, num_common, similarity, mark) f.close() print 'Wrote %s' % reportfile mt.checkpoint("End") mt.report() if __name__ == '__main__': parser = optparse.OptionParser() parser.add_option( '-l', '--line-threshold', dest='line_threshold', type='int', default=1, help='ignore lines that occur more than this number of times in either ' 'set') parser.add_option( '-o', '--out', dest='reportfile', default=None, help='Write output to this file instead of the default.') (options, argv) = parser.parse_args() try: left, right = argv[:2] except ValueError: print 'Pass 2 files containing paths as arguments (left tree, right tree).' sys.exit(1) if not options.reportfile: options.reportfile = 'out-%s-vs-%s.%s.txt' % ( os.path.splitext(os.path.basename(left))[0], os.path.splitext(os.path.basename(right))[0], options.line_threshold) set1 = [f.strip() for f in open(left).readlines()] set2 = [f.strip() for f in open(right).readlines()] FindSimilarPairs(set1, set2, options.reportfile, options.line_threshold)