#!/usr/bin/env python3
"""Build a 'manifest' for a directory tree, plus a binary '<file>.hash'
sidecar per file containing per-chunk SHA-256 records.

Manifest format: three text lines per file — relative path (forward
slashes), size in bytes, whole-file SHA-256 hex digest.

Sidecar record format: u64 offset, u64 length (native struct 'QQ'),
followed by the 32 raw SHA-256 bytes of that chunk.
"""

import hashlib
import math
import os
import sys
from struct import pack

# File-size policy thresholds (see B*2185704): warn at 10 GiB, abort at 20 GiB.
WARN_ON_FILESIZE = 1024 * 1024 * 1024 * 10
ERROR_ON_FILESIZE = 1024 * 1024 * 1024 * 20

# Chunking policy: aim for 10% of each file, clamped to [1 KiB, 2 MiB],
# then rounded up to a whole KiB.
MIN_CHUNK_SIZE = 1024
MAX_CHUNK_SIZE = 1024 * 1024 * 2  # 2 MiB
CHUNK_SIZE_PERCENT = 0.1          # Aim for 10% of each file


def write_hash(hashfile, offset, length, digest):
    """Append one binary record to `hashfile`.

    Record layout: offset (u64), length (u64), then the raw digest bytes
    of `digest` (a hashlib digest object). Called once per chunk.
    """
    hashfile.write(pack("QQ", offset, length))
    hashfile.write(digest.digest())


def calc_hash(fullpath, size):
    """Hash `fullpath` (of `size` bytes) chunk by chunk.

    Writes one (offset, length, sha256) record per chunk to
    `fullpath + ".hash"` and returns the hex SHA-256 of the whole file.
    Prints coarse progress for files over 100 MiB.
    """
    ongoing_digest = hashlib.sha256()

    # Pick a decent chunk size: ~10% of the file, clamped, then rounded
    # up to the next whole KiB.
    target_chunk_size = min(max(size * CHUNK_SIZE_PERCENT, MIN_CHUNK_SIZE),
                            MAX_CHUNK_SIZE)
    target_chunk_size = int(1024 * math.ceil(target_chunk_size / 1024.0))

    # For large files, display extra progress info in ~10% steps.
    bigfile = size > 100 * 1024 * 1024
    progress_split = 10

    # 'with' guarantees both handles are closed even if a read/write raises
    # (the original leaked them on error).
    with open(fullpath, "rb") as input_file, \
            open(fullpath + ".hash", "wb") as hash_file:
        offset = 0
        while offset < size:
            # Read a chunk (the final chunk may be short).
            chunk_size = min(target_chunk_size, size - offset)
            data = input_file.read(chunk_size)

            # Per-chunk digest, plus fold into the whole-file digest.
            chunk_digest = hashlib.sha256(data)
            ongoing_digest.update(data)
            write_hash(hash_file, offset, chunk_size, chunk_digest)

            # Optionally show progress info when crossing a 10% boundary.
            prev_int_progress = int(progress_split * offset / size)
            offset += chunk_size
            current_int_progress = int(progress_split * offset / size)
            if (bigfile and current_int_progress > 0
                    and current_int_progress != prev_int_progress):
                # '//' preserves the original Python 2 integer division.
                print(str(100 * current_int_progress // progress_split) + "%...")

        # For files of more than one chunk, a trailing whole-file record
        # was intentionally disabled in the original:
        # if target_chunk_size < size:
        #     write_hash(hash_file, 0, size, ongoing_digest)

    return ongoing_digest.hexdigest()


def main(argv=None):
    """Walk the target directory, hash every file, and write the manifest.

    Skips '.hash' sidecars and the manifest itself. Aborts (removing the
    partial manifest) on any file over ERROR_ON_FILESIZE; counts and
    reports files over WARN_ON_FILESIZE.
    """
    argv = sys.argv if argv is None else argv

    # Ensure we get a directory as a parameter.
    if len(argv) < 2:
        # Original usage string was truncated ("...py "); name the argument.
        sys.exit("Usage: make-manifest-and-hashes.py <targetdir>")
    targetdir = argv[1]
    if not os.path.isdir(targetdir):
        sys.exit("'" + targetdir + "' is not a directory.")

    # Open the manifest; remember its path so the abort path can remove it.
    manifest_path = os.path.join(targetdir, "manifest")
    manifest = open(manifest_path, "w")

    # Amount to trim from full paths to make them relative
    # (target path + 1 for the trailing slash).
    trim_length = len(targetdir) + 1
    large_file_warnings = 0

    try:
        # Iterate over all files, recursing into subdirectories.
        for root, dirs, files in os.walk(targetdir):
            for f in files:
                # Never hash our own outputs.
                if f.endswith(".hash") or f == "manifest":
                    continue

                fullpath = os.path.join(root, f)
                relativepath = fullpath[trim_length:].replace("\\", "/")
                size = os.path.getsize(fullpath)
                print(relativepath)
                print(size)

                if size > ERROR_ON_FILESIZE:
                    print("ERROR: File is greater than "
                          + str(ERROR_ON_FILESIZE) + " bytes (see B*2185704)")
                    print("Aborting.")
                    manifest.close()
                    os.remove(manifest_path)
                    os.system("pause")  # Windows-only: wait for a keypress
                    sys.exit("Erroneous filesize encountered.")
                elif size > WARN_ON_FILESIZE:
                    print("WARNING: File is greater than "
                          + str(WARN_ON_FILESIZE) + " bytes (see B*2185704)")
                    large_file_warnings += 1

                hexdigest = calc_hash(fullpath, size)
                print(hexdigest)
                manifest.write(relativepath + "\n")
                manifest.write(str(size) + "\n")
                manifest.write(hexdigest + "\n")
    finally:
        # Close the manifest even if a hash/read raises mid-walk.
        manifest.close()

    print("Finished.")
    if large_file_warnings > 0:
        print()
        if large_file_warnings == 1:
            print("WARNING: 1 file was greater than "
                  + str(WARN_ON_FILESIZE) + " bytes (see B*2185704)")
        else:
            print("WARNING: " + str(large_file_warnings)
                  + " files were greater than "
                  + str(WARN_ON_FILESIZE) + " bytes (see B*2185704)")
        print()
    os.system("pause")  # Windows-only: wait for a keypress


if __name__ == "__main__":
    main()