#!/usr/bin/python3 -BbbEIsSttW all
"""This module implements a simple file system indexer creating
a JSON intermediate format to be then imported to the standard
backup tools chain. This allows indexing of file systems without
having the full Java based backup stack installed.

This software is provided by the copyright owner "as is" and
without any expressed or implied warranties, including, but not
limited to, the implied warranties of merchantability and fitness
for a particular purpose are disclaimed. In no event shall the
copyright owner be liable for any direct, indirect, incidental,
special, exemplary or consequential damages, including, but not
limited to, procurement of substitute goods or services, loss
of use, data or profits or business interruption, however caused
and on any theory of liability, whether in contract, strict
liability, or tort, including negligence or otherwise, arising
in any way out of the use of this software, even if advised of
the possibility of such damage.

Copyright (c) 2019 halfdog

You may not distribute copies, re-license, sell or make any kind
of profit with this software without proper prior written consent
from the author. You may not remove this copyright notice, license
or terms of use.

PS: Restrictive license as I did not have time to publish this
backup data quality verification, deduplication, data synchronization
tools. I would be willing to do so if someone assists in getting
software distributed as Debian package."""

import hashlib
import json
import os
import re
import stat
import sys

# Byte values copied verbatim into a resource name; every other byte
# is percent-encoded by fileNameToResourceName.
_SAFE_RESOURCE_BYTES = frozenset(
    b'0123456789-.ABCDEFGHIJKLMNOPQRSTUVWXYZ_abcdefghijklmnopqrstuvwxyz~')


def fileNameToResourceName(fileName):
    """Convert a single file name to its percent-encoded resource name.
    The name is encoded to bytes using the default encoding with
    'surrogateescape' error handling, so undecodable bytes that were
    mapped to lone surrogates are turned back into their byte values.
    Digits, ASCII letters and the characters '-', '.', '_' and '~'
    are kept, all other bytes are written as '%xx' (lowercase hex).
    @param fileName the file name string (one path component).
    @return the encoded resource name string."""
    nameBytes = fileName.encode(
        sys.getdefaultencoding(), errors='surrogateescape')
    return ''.join(
        chr(val) if val in _SAFE_RESOURCE_BYTES else '%%%02x' % val
        for val in nameBytes)


def listDir(pathName, parentResourceUrl, excludeRegexList):
    """List the directory, apply exclude rules, sort all results
    and return the list suitable for indexing stack use.
    @param pathName the file system path of the directory to list.
    @param parentResourceUrl the ResourceUrl of the directory, used
    as prefix for the ResourceUrl of each entry.
    @param excludeRegexList a list of compiled regular expressions
    or None. An entry is dropped when any of them matches at the
    start of its ResourceUrl.
    @return a list of (file system path, ResourceUrl) tuples sorted
    by ResourceUrl."""
    resultList = []
    for fileName in os.listdir(pathName):
        filePathName = os.path.join(pathName, fileName)
        fileResourceName = parentResourceUrl + fileNameToResourceName(fileName)
        # Add '/' to directories but not links pointing to directories,
        # hence lstat instead of stat.
        if stat.S_ISDIR(os.lstat(filePathName).st_mode):
            fileResourceName += '/'
        if excludeRegexList and any(
                regex.match(fileResourceName) for regex in excludeRegexList):
            continue
        resultList.append((filePathName, fileResourceName))
    resultList.sort(key=lambda x: x[1])
    return resultList


def createDigest(pathName):
    """Create a hexadecimal digest string for the file pointed
    out by pathname.
    @param pathName the path of the file to read.
    @return the MD5 digest of the file content as hex string."""
    digestObj = hashlib.md5()
    # The with statement closes the file also when reading fails.
    with open(pathName, 'rb') as digestFile:
        # Read in 16 MiB chunks to keep memory use bounded.
        for digestData in iter(lambda: digestFile.read(1 << 24), b''):
            digestObj.update(digestData)
    return digestObj.hexdigest()


def doIndex(
        rootPathName, excludeRegexStrList=None, indexStartPath=None,
        updateIndexFile=None):
    """Create an index for a given file system part and write it
    as JSON list to stdout, one record per file system element.
    @param rootPathName string pointing to the file system root
    for indexing. It is reported with ResourceUrl '/'.
    @param excludeRegexStrList a list of regular expression strings
    to match against the ResourceUrl.
    @param indexStartPath accepted for interface compatibility,
    currently not used by the indexing code.
    @param updateIndexFile optional path of an ASCII JSON index
    written by an earlier run. For regular files whose new record
    is identical to the old one the old MD5 digest is reused instead
    of reading the file again.
    @raise Exception when a file type other than directory, pipe,
    link, regular file or socket is found."""
    excludeRegexList = None
    if excludeRegexStrList:
        excludeRegexList = [
            re.compile(regexStr) for regexStr in excludeRegexStrList]

    oldIndexData = []
    oldIndexDataPos = 0
    if updateIndexFile:
        with open(updateIndexFile, 'rb') as indexFile:
            oldIndexData = json.loads(str(indexFile.read(), 'ascii'))

    # Have an indexing stack with a sorted list of elements to visit,
    # the index of the next element and the ResourceUrl up to the
    # level currently being processed.
    indexStack = [[[(rootPathName, '/')], 0, '/']]

    # Begin the JSON output format list.
    continuationFlag = False
    print('[')
    while indexStack:
        stackFrame = indexStack[-1]
        fileSystemPath, resourceUrl = stackFrame[0][stackFrame[1]]
        # Zero out the element to avoid having a large number of items
        # linked when not needed any more.
        stackFrame[0][stackFrame[1]] = None
        stackFrame[1] += 1
        if stackFrame[1] == len(stackFrame[0]):
            del indexStack[-1]

        # Advance the cursor in the old index to the first record with
        # an url not smaller than the current one. The cursor is never
        # moved back: this relies on the old index being ordered the
        # same way as the current traversal.
        oldRecord = None
        while oldIndexDataPos < len(oldIndexData):
            oldCandidate = oldIndexData[oldIndexDataPos]
            if oldCandidate['url'] < resourceUrl:
                oldIndexDataPos += 1
                continue
            if oldCandidate['url'] == resourceUrl:
                oldRecord = oldCandidate
            break

        statData = os.lstat(fileSystemPath)
        indexResult = {
            'group': statData.st_gid,
            'inode': statData.st_ino,
            'length': statData.st_size,
            'mode': statData.st_mode & 0o7777,
            'mtime': statData.st_mtime,
            'type': None,
            'url': resourceUrl,
            'user': statData.st_uid,
        }

        if stat.S_ISDIR(statData.st_mode):
            indexResult['type'] = 'dir'
            subResourceList = listDir(
                fileSystemPath, resourceUrl, excludeRegexList)
            if subResourceList:
                indexStack.append([subResourceList, 0, resourceUrl])
        elif stat.S_ISFIFO(statData.st_mode):
            indexResult['type'] = 'pipe'
        elif stat.S_ISLNK(statData.st_mode):
            indexResult['type'] = 'link'
            indexResult['typedata'] = os.readlink(fileSystemPath)
        elif stat.S_ISREG(statData.st_mode):
            indexResult['type'] = 'file'
            # Only this step should be skipped if old and new entry are
            # identical. The old record may lack a digest, e.g. when the
            # url was not a regular file in the previous run, so use
            # get() instead of failing with a KeyError.
            oldDigest = None
            if oldRecord is not None:
                oldDigest = oldRecord.get('digest-md5')
            if oldDigest is not None:
                indexResult['digest-md5'] = oldDigest
            if oldDigest is None or oldRecord != indexResult:
                indexResult['digest-md5'] = createDigest(fileSystemPath)
        elif stat.S_ISSOCK(statData.st_mode):
            indexResult['type'] = 'socket'
        else:
            raise Exception('Unhandled file type for %s' % fileSystemPath)

        recordData = json.dumps(indexResult, sort_keys=True)
        if continuationFlag:
            sys.stdout.write(',\n%s' % recordData)
        else:
            sys.stdout.write('%s' % recordData)
            continuationFlag = True
    print(']')


def mainFunction():
    """This is the main function to analyze the program call
    arguments and invoke indexing. Supported options, each followed
    by one value, are '--Exclude' (may be repeated), '--Include'
    and '--Update'. A '--' ends option processing. The last argument
    is always the indexing root path. On invalid arguments a message
    is written to stderr and the program exits with status 1."""
    excludeRegexStrList = []
    indexStartPath = None
    updateIndexFile = None

    argPos = 1
    # The last argument is reserved for the root path, so it is never
    # interpreted as option name.
    while argPos < len(sys.argv) - 1:
        argName = sys.argv[argPos]
        argPos += 1
        if argName == '--':
            break
        if argName == '--Exclude':
            excludeRegexStrList.append(sys.argv[argPos])
            argPos += 1
            continue
        if argName == '--Include':
            indexStartPath = sys.argv[argPos]
            argPos += 1
            continue
        if argName == '--Update':
            updateIndexFile = sys.argv[argPos]
            argPos += 1
            continue
        # Do not silently drop arguments that are not understood.
        print('Unknown argument %s' % repr(argName), file=sys.stderr)
        sys.exit(1)

    if argPos + 1 != len(sys.argv):
        print('No indexing root path given (last argument)', file=sys.stderr)
        sys.exit(1)

    rootPathName = sys.argv[argPos]
    doIndex(
        rootPathName, excludeRegexStrList=excludeRegexStrList,
        indexStartPath=indexStartPath, updateIndexFile=updateIndexFile)


if __name__ == '__main__':
    mainFunction()