# - - - - - - - - - - - - - - - - - - - - - -
# To Do
# 1. [DONE] Unpack on multiple threads; AddJobsToQueue should be ok with this
# 2. ignore certain extensions (is this really worth it?)
# 3. update op as we go along instead of in one go at the end
# - - - - - - - - - - - - - - - - - - - - - -
import sys, os, argparse, threading, queue, time, glob, subprocess, shutil
import cProfile, io, pstats

# - - - - - - - - - - - - - - - - - - - - - -
# program options, can be overwritten in command line
# - - - - - - - - - - - - - - - - - - - - - -
encoding = "ascii"             # keywords are encoded with this before byte-searching
chunkSize = 1024 * 1024        # 1 MiB read granularity inside a scan job
# FIX: was 8*1024*2014 -- "2014" is clearly a typo for "1024"
threadChunk = 8 * 1024 * 1024  # each scan job covers an 8 MiB slice of a file
reportWidth = 32               # the amount of context reported around a finding
tempDir = "D:/temp_search/"
nThreads = 24                  # scanner threads
nUnpackThreads = 3             # rpf-unpacker threads
unpacktool = "%RS_TOOLSIRONLIB%/lib/RSG.Pipeline.RpfExtract.exe"

lstKeywords = []    # this will be set by a command line argument later on
lstInputFiles = []  # this will be set by a command line argument later on

# Queues are used (instead of lists) so all cross-thread traffic is thread safe.
taskQueue = queue.Queue()     # (file, rpf, start, length) scan jobs
resultQueue = queue.Queue()   # CReport findings
allFileQueue = queue.Queue()  # every file that was handed out for scanning
rpfQueue = queue.Queue()      # stores all the rpfs we have to convert


# For storing reports
class CReport:
    """One keyword hit: where it was found and the surrounding bytes."""

    def __init__(self, filename, rpf, keyword, chunk, loc):
        self.filename = filename
        self.rpf = rpf            # originating rpf path, or None for loose files
        self.location = loc       # byte offset of the hit within the file
        self.keyword = keyword
        self.string = chunk       # context bytes surrounding the hit

    def __str__(self):
        if self.rpf is not None:
            # Strip everything up to and including ".rpf" so the report shows
            # the path *inside* the archive rather than the temp-extract path.
            return "\t{1}/{0}:\n\t\toffset:{2}\n\t\t...{3}...\n".format(
                self.filename[self.filename.rindex(".rpf") + 4:],
                self.rpf, self.location, self.string)
        else:
            return "\t{0}:\n\t\toffset:{1}\n\t\t...{2}...\n".format(
                self.filename, self.location, self.string)


# Yields the bytes between start and start + length in a file in steps of step
def bytes_from_file(filename, start=0, length=1024, step=1024):
    if not os.path.isfile(filename):
        return
    with open(filename, "rb") as f:   # with-block closes f; explicit close removed
        f.seek(start, 0)
        total_length = 0
        while total_length < length:
            chunk = f.read(step)
            if not chunk:
                break
            yield bytes(chunk)
            total_length = total_length + step


# Yields the contents of a text file line by line, skipping comments
# ('#' anywhere, '//' at the start) and blank lines.
def parse_file(filename):
    if not os.path.isfile(filename):
        return
    with open(filename, "r") as f:
        while True:
            line = f.readline()
            if not line:
                break
            if "#" in line or "//" in line[:2] or len(line) == 0 or "\n" in line[:2]:
                continue
            # drop the trailing newline (last line of the file may lack one)
            if "\n" in line:
                yield line[:-1]
            else:
                yield line


# todo evaluate the string as a literal
def create_list(str):
    # A python-style list literal is not parsed yet; otherwise treat the
    # argument as a path to a newline-separated list file.
    if "[" in str:
        return []
    else:
        return [f for f in parse_file(str)]


# worker class, reads specific bytes from a file and then checks them for our strings
class ThreadReadFile(threading.Thread):
    WAITING = 0
    WORKING = 1
    killAll = False   # class-wide stop flag, set by the main thread

    def __init__(self, queue, rqueue, namae):
        threading.Thread.__init__(self, name=namae)
        self.queue = queue      # taskQueue: (file, rpf, start, length)
        self.rqueue = rqueue    # resultQueue: CReport findings go here
        self.daemon = True
        self.status = ThreadReadFile.WAITING

    def run(self):
        while True:
            if ThreadReadFile.killAll:  # we were told to quit
                return
            try:
                file, rpf, start, length = self.queue.get(block=False)
                self.work(file, rpf, start, length)
            except queue.Empty:
                self.status = ThreadReadFile.WAITING
                time.sleep(1.0)
            except Exception as e:
                print(str(e))

    def work(self, file, rpf, start, length):
        self.status = ThreadReadFile.WORKING
        for chunk in bytes_from_file(file, start=start, length=length, step=chunkSize):
            lchunk = chunk.lower()
            # keywords present anywhere in this chunk (case-insensitive)
            lstFound = [keyword for keyword in lstKeywords
                        if bytes(keyword, encoding).lower() in lchunk]
            for keyword in lstFound:
                # It might be found multiple times within the chunk; report all.
                needle = bytes(keyword, encoding).lower()  # hoisted out of the loop
                index = 0
                while index < len(lchunk):
                    index = lchunk.find(needle, index)
                    if index == -1:
                        break
                    # FIX: clamp the left edge -- a negative slice start wraps
                    # to the end of the chunk and reported garbage context.
                    context = chunk[max(0, index - reportWidth):index + reportWidth]
                    self.rqueue.put(CReport(file, rpf, keyword, context, start + index))
                    index += len(keyword)


# worker class, unpacks rpf files
class ThreadUnpackFile(threading.Thread):
    WAITING = 0
    WORKING = 1
    killAll = False   # class-wide stop flag, set by the main thread

    def __init__(self, rpfQ, namae):
        threading.Thread.__init__(self, name=namae)
        # input comes from here, and nested rpfs are fed back into it too
        self.iqueue = rpfQ
        self.daemon = True
        self.status = ThreadUnpackFile.WAITING

    def run(self):
        while True:
            if ThreadUnpackFile.killAll:  # we were told to quit
                return
            try:
                file, unpack_dir, rpf_parent = self.iqueue.get(block=False)
                self.work(file, unpack_dir, rpf_parent)
            except queue.Empty:
                self.status = ThreadUnpackFile.WAITING
                time.sleep(1.0)
            except Exception as e:
                print(str(e))

    def work(self, file, unpack_dir, rpf_parent):
        self.status = ThreadUnpackFile.WORKING
        unpackDir = os.path.join(unpack_dir, os.path.basename(file))
        # for result reporting, so we can have a clear idea of what rpf it came from
        rpf_parent = os.path.join(rpf_parent, os.path.basename(file)).replace("\\", "/")
        if not os.path.exists(unpackDir):
            os.makedirs(unpackDir)
        subprocess.call([unpacktool, "--output", unpackDir, "--overwrite", file])
        # collect what came out; RPFs may be nested within RPFs
        lstUnpackedFiles = []
        lstUnpackedRPFs = []
        for dir, _, _ in os.walk(unpackDir):
            lstTemp = [f.replace("\\", "/") for f in glob.glob(os.path.join(dir, "*.*"))
                       if not os.path.isdir(f)]
            lstUnpackedFiles.extend([f for f in lstTemp if not ".rpf" in f[-4:].lower()])
            lstUnpackedRPFs.extend([(f, unpackDir, rpf_parent)
                                    for f in set(lstTemp) - set(lstUnpackedFiles)])
        # the normal files go straight to the scan queue...
        AddJobsToQueue([(f, rpf_parent) for f in lstUnpackedFiles], longestKeyLength)
        # ...and the nested rpfs go back into our own queue
        for f in lstUnpackedRPFs:
            self.iqueue.put(f)
        # delete the rpf file if it is in the tempdir (i.e. it was itself extracted)
        if tempDir.lower() in file.lower():
            os.remove(file)


def CheckPath(path):
    """Report any keyword that occurs in the file path itself."""
    # FIX: lower the path too -- previously lowercase keywords were compared
    # against the raw path, so mixed-case paths never matched.
    lpath = path.lower()
    for kw in [keyword for keyword in lstKeywords if keyword.lower() in lpath]:
        resultQueue.put(CReport(path, None, kw, "[in file path]", 0))


def AddJobsToQueue(lstFiles, keyLength):
    """Split each (file, rpf) pair into threadChunk-sized scan jobs.

    Returns the total size in bytes of all files queued.
    """
    total = 0
    for file, rpfFile in lstFiles:
        absFile = os.path.abspath(file.replace("\\", "/"))
        fileSize = os.path.getsize(absFile)
        taskpos = 0
        while taskpos < fileSize:
            # back up by keyLength on the off chance that the divide happens
            # in the middle of a keyword, splitting it across two jobs
            taskQueue.put((absFile, rpfFile, max(0, taskpos - keyLength), threadChunk))
            taskpos += threadChunk
        # FIX: the original overwrote the accumulator with each file's size and
        # then added it again, so the returned total was wrong.
        total += fileSize
        # if we are checking the file path do it here
        if args.includepath == True:
            CheckPath(file)
        allFileQueue.put(file)
    return total


# - - - - - - - - - - - - - - - - - - - - - -
# Setup the input args
# - - - - - - - - - - - - - - - - - - - - - -
parser = argparse.ArgumentParser()
parser.add_argument("files", help='''input file (a text file conatining a list of files to search separated by new lines) can also be a python style list e.g. ["x:\a.exe","x:\b.bin"] ''')
parser.add_argument("keywords", help="keyword file (a text file conatining a list of keywords to search separated by new lines) ")
parser.add_argument("output", help="text file to dump output to")
parser.add_argument("-t", "--tempdir", help="Location to extract RPFs to default : {0}".format(tempDir))
parser.add_argument("-w", "--reportwidth", help="size of surrounding area to be reported. Default : {0}".format(reportWidth))
parser.add_argument("-v", "--verbose", help="output extra info", action='store_true')
parser.add_argument("-n", "--numthreads", help="number of threads")
parser.add_argument("-i", "--includepath", help="include the file path in the search", action='store_true')
args = parser.parse_args()

if not "RS_TOOLSIRONLIB" in os.environ:
    print("RS_TOOLSIRONLIB not set")
    sys.exit()
unpacktool = os.path.expandvars(unpacktool)

# setup based on args
if args.files is None or args.keywords is None:
    parser.print_help()
    sys.exit()

lstInputFiles = create_list(args.files)
lstKeywords = create_list(args.keywords)
# FIX: guard against an empty keyword list -- max() on [] raises ValueError
longestKeyLength = len(max(lstKeywords, key=len)) if lstKeywords else 0

op = None
try:
    op = open(args.output, 'w')
except OSError:
    print("output path not valid...output will be dumped to console")
    op = None  # handle error here

if args.tempdir is not None:
    tempDir = args.tempdir
tempDir = tempDir.replace("\\", "/")
# crude sanity check so we never rmtree somewhere catastrophic
if "system32" in tempDir or "c:/" == tempDir or "c://" == tempDir:
    print("\n\n\nYou have set a dangerous directory for your temp\n\n\n")
    sys.exit()
if not os.path.exists(tempDir):
    try:
        os.makedirs(tempDir)
    except OSError:
        print("\ncouldn't create temporary folder....exiting\n")
        sys.exit()
else:
    # wipe stale extractions; the unpack threads recreate what they need
    shutil.rmtree(tempDir)

if args.numthreads is not None:
    try:
        nThreads = int(args.numthreads)
    except ValueError:
        nThreads = 1

# FIX: -w/--reportwidth was parsed but never applied
if args.reportwidth is not None:
    try:
        reportWidth = int(args.reportwidth)
    except ValueError:
        pass

# - - - - - - - - - - - - - - - - - -
# Sort the inputs into loose files and rpf archives.
# lstInputFiles grows while we walk it: directories append their contents,
# which is why this is an index loop rather than a for-in.
# - - - - - - - - - - - - - - - - - -
lstLooseFiles = []
idx = 0
while idx < len(lstInputFiles):
    entry = lstInputFiles[idx]
    if not os.path.isdir(entry):
        # files
        if entry.endswith(".rpf"):
            for f in glob.glob(entry):
                rpfQueue.put((f, tempDir, os.path.dirname(f)))
        else:
            for f in glob.glob(entry):
                if f.endswith(".rpf"):
                    rpfQueue.put((f, tempDir, os.path.dirname(f)))
                else:
                    lstLooseFiles.append(f)
    else:
        # folders: expand into individual files for a later pass
        for dir, subdir, files in os.walk(entry):
            for tmp in files:
                lstInputFiles.append(os.path.join(dir, tmp))
    idx = idx + 1

AddJobsToQueue([(f, None) for f in lstLooseFiles], longestKeyLength)

# - - - - - - - - - - - - - - - - - -
# Start the worker threads working on non-RPF files
# and then unpack the RPF ones
# - - - - - - - - - - - - - - - - - -
# scans through data looking for keywords
lstReadThreads = [ThreadReadFile(taskQueue, resultQueue, "[ReadThread{0}]".format(i))
                  for i in range(nThreads)]
# feeds the read threads with unpacked data
lstUnpackThreads = [ThreadUnpackFile(rpfQueue, "[UnpackThread{0}]".format(i))
                    for i in range(nUnpackThreads)]

print("{0} jobs in total, between {1} threads".format(taskQueue.qsize(), nThreads))
for thread in lstReadThreads:
    thread.start()
for thread in lstUnpackThreads:
    thread.start()

# - - - - - - - - - - - - - - - - - -
# Monitor unpacking; unpack threads add scan jobs as they go
# - - - - - - - - - - - - - - - - - -
print("Working")
oldnTasks = -1
oldnScan = -1
while True:
    nU = len([1 for t in lstUnpackThreads if t.status == ThreadUnpackFile.WORKING])
    nR = len([1 for t in lstReadThreads if t.status == ThreadReadFile.WORKING])
    nTasks = rpfQueue.qsize()
    nScan = taskQueue.qsize()
    if oldnTasks != nTasks or oldnScan != nScan:
        sys.stdout.write("{0} Unpack Jobs left, [{1}/{2}] Threads Active\n".format(nTasks + nU, nU, nUnpackThreads))
        sys.stdout.write("{0} Scan Jobs left, [{1}/{2}] Threads Active\n".format(nScan + nR, nR, nThreads))
        sys.stdout.flush()
        oldnScan = nScan
        oldnTasks = nTasks
    if nU == 0 and nTasks == 0:
        # break on unpacking finishing
        ThreadUnpackFile.killAll = True
        break
    time.sleep(2)

# possibly redundant (the break above implies no unpack work is left)
for thread in lstUnpackThreads:
    thread.join()

# - - - - - - - - - - - - - - - -
# ok we are done unpacking, lets
# start waiting for the worker threads to finish
# - - - - - - - - - - - - - - - -
while threading.active_count() > 1:
    n = len([1 for t in lstReadThreads if t.status == ThreadReadFile.WORKING])
    nTasks = taskQueue.qsize()
    # NOTE(review): oldnScan last held a queue size, here it is compared to a
    # thread count -- this only affects how often the progress line prints.
    if oldnScan != n:
        sys.stdout.write("{0} Scan Jobs left, [{1}/{2}] Threads Active\n".format(nTasks + n, n, nThreads))
        sys.stdout.flush()
        oldnScan = n
    if n == 0 and nTasks == 0:
        ThreadReadFile.killAll = True
        break
    time.sleep(2)
sys.stdout.flush()

# - - - - - - - - - - - - - - - -
# sort output data
# - - - - - - - - - - - - - - - -
nResults = 0
dctGroup = {}
while True:
    try:
        q = resultQueue.get(False)
        dctGroup.setdefault(q.keyword, []).append(q)
        nResults += 1
    except queue.Empty:
        break

# - - - - - - - - - - - - - - - -
# write output data
# - - - - - - - - - - - - - - - -
writeFunc = print
if op:
    writeFunc = op.write
writeFunc("results:{0}\n".format(nResults))

lstAllFiles = []
while True:
    try:
        lstAllFiles.append(allFileQueue.get(block=False))
    except queue.Empty:
        break

if args.verbose:
    writeFunc("keywords:\n")
    for kw in lstKeywords:
        writeFunc("\t{0}\n".format(kw))
    writeFunc("files:\n")
    for file in lstAllFiles:
        writeFunc("\t{0}\n".format(file))

writeFunc("number of parsed files : {0}\n".format(len(lstAllFiles)))
for key in dctGroup:
    writeFunc(key + " ({0}) \n".format(len(dctGroup[key])))
    for item in dctGroup[key]:
        writeFunc(str(item))

if op:
    op.close()

# possibly redundant (killAll has been set; daemon threads die with the process)
for thread in lstReadThreads:
    thread.join()