gtav-src/tools_ng/script/util/rpfsearch.py

# - - - - - - - - - - - - - - - - - - - - - -
# To Do
#	1. [DONE]Unpack on multiple threads AddJobsToQueue should be ok with this
#	2. ignore certian extensions ( is this really worth it?)
#	3. update op as we go along instead of in one go at the end
# - - - - - - - - - - - - - - - - - - - - - -
import sys, os, argparse, threading, queue, time, glob, subprocess, shutil
import cProfile, io, pstats

# - - - - - - - - - - - - - - - - - - - - - -
# program options, can be overwritten in command line
# - - - - - - - - - - - - - - - - - - - - - -
encoding = "ascii"
chunkSize = 1024*1024		#1 megabyte steps for now....
threadChunk = 8*1024*2014	#tasks split into 64 MB chunks
reportWidth = 32			#the amount of text around the finding listed
tempDir = "D:/temp_search/"
nThreads = 24
nUnpackThreads = 3
unpacktool = "%RS_TOOLSIRONLIB%/lib/RSG.Pipeline.RpfExtract.exe"

lstKeywords = []		#this will be set by a command line argument later on
lstInputFiles = []			#this will be set by a command line argument later on

taskQueue = queue.Queue()
resultQueue = queue.Queue()

allFileQueue = queue.Queue()
rpfQueue = queue.Queue()  #stores all the rpfs we have to cnvert


# For storing reports
class CReport:
	def __init__(self, filename, rpf, keyword, chunk, loc):
		self.filename = filename
		self.rpf = rpf
		self.location = loc
		self.keyword = keyword
		self.string = chunk

	def __str__(self):
		if self.rpf != None:
			return "\t{1}/{0}:\n\t\toffset:{2}\n\t\t...{3}...\n".format( self.filename[self.filename.rindex(".rpf")+4:], self.rpf, self.location, self.string )
		else:
			return "\t{0}:\n\t\toffset:{1}\n\t\t...{2}...\n".format( self.filename, self.location, self.string )

# Yeilds the bytes between start and start + length in a file in steps of step
def bytes_from_file(filename, start=0, length = 1024, step=1024):
	if not os.path.isfile(filename):
		return
	with open(filename, "rb") as f:
		f.seek(start, 0)
		total_length = 0
		while total_length < length:
			chunk = f.read( step )
			if chunk:
				yield bytes( chunk )
			else:
				break
			total_length = total_length + step
		f.close()

#returns the contents of a file as a list where each new line denotes a different item in the list
def parse_file(filename):
	if not os.path.isfile(filename):
		return

	with open(filename, "r") as f:
		while True:
			line = f.readline()
			if line:
				if "#" in line or "//" in line[:2] or len(line) == 0 or "\n" in line[:2]:
					continue
				if "\n" in line:
					yield line[:-1]
				else:
					yield line
			else:
				break
		f.close()

#todo  evaluate the string as a literal
def create_list(str):
	if "[" in str:
		return []
	else:
		return [f for f in parse_file(str)]

#worker class, reads specific bytes from a file and then checks them for our strings
class ThreadReadFile(threading.Thread):
	WAITING = 0
	WORKING = 1
	killAll = False
	def __init__(self, queue, rqueue, namae):
		threading.Thread.__init__(self, name=namae)
		self.queue = queue
		self.rqueue = rqueue
		self.daemon = True
		self.status = ThreadReadFile.WAITING

	def run(self):
		while True:
			if ThreadReadFile.killAll: #we were told to quit
				return;
			try:
				data = self.queue.get(block=False)
				file, rpf, start, length = data
				self.work(file, rpf, start, length)
			except queue.Empty:
				self.status = ThreadReadFile.WAITING
				time.sleep(1.0)
			except Exception as e:
				print(str(e))

	def work(self, file, rpf, start, length):
		self.status = ThreadReadFile.WORKING
		for chunk in bytes_from_file(file, start=start, length=length, step=chunkSize):

			lchunk = chunk.lower()
			#Get a list of the words we find
			lstFound = [keyword for keyword in lstKeywords if bytes(keyword, encoding).lower() in lchunk]

			for keyword in lstFound:
				#It might be found multiple times within the chunk, lets just report them
				#all
				index = 0
				while index < len(lchunk):
					index = bytes(lchunk).find(bytes(keyword, encoding).lower(), index)
					if index == -1:
						break

					self.rqueue.put(CReport(file, rpf, keyword, chunk[index-reportWidth:index+reportWidth], start + index) )
					index += len(keyword)


#worker class, unpacks rpf files
class ThreadUnpackFile(threading.Thread):
	WAITING = 0
	WORKING = 1
	killAll = False
	def __init__(self, rpfQ, namae):
		threading.Thread.__init__(self, name=namae)
		self.iqueue = rpfQ	#input comes from here, and some of the results go here too
		#self.oqueue = fileQ #results go here
		self.daemon = True
		self.status = ThreadUnpackFile.WAITING

	def run(self):
		while True:
			if ThreadUnpackFile.killAll: #we were told to quit
				return;
			try:
				data = self.iqueue.get(block=False)
				file, unpack_dir, rpf_parent = data
				self.work(file, unpack_dir, rpf_parent)
			except queue.Empty:
				self.status = ThreadUnpackFile.WAITING
				time.sleep(1.0)
			except Exception as e:
				print(str(e))

	def work(self, file, unpack_dir, rpf_parent):
		self.status = ThreadUnpackFile.WORKING

		rpfFile = os.path.abspath(file)

		unpackDir = os.path.join(unpack_dir,os.path.basename(file))
		rpf_parent =  os.path.join(rpf_parent, os.path.basename(file)).replace("\\","/") #for result reporting, so we can have a clear idea of what rpf it came from

		if not os.path.exists(unpackDir):
			os.makedirs(unpackDir)

		subprocess.call([unpacktool, "--output", unpackDir, "--overwrite", file])

		#add to the queue
		lstUnpackedFiles = []
		lstUnpackedRPFs = []
		for dir,_,_ in os.walk(unpackDir):
			#RPFs may be within RPFS
			lstTemp = [f.replace("\\","/") for f in glob.glob(os.path.join(dir,"*.*")) if not os.path.isdir(f)]
			lstUnpackedFiles.extend([f for f in lstTemp if not ".rpf" in f[-4:].lower()])
			lstUnpackedRPFs.extend([(f, unpackDir, rpf_parent) for f in set(lstTemp) - set(lstUnpackedFiles)])

		#add the normal files to the task queue
		AddJobsToQueue( [(f, rpf_parent) for f in lstUnpackedFiles], longestKeyLength )

		#for f in lstUnpackedFiles:
		#	self.oqueue.put(f)
		for f in lstUnpackedRPFs:
			self.iqueue.put(f)

		#lstAllFiles.extend(lstUnpackedFiles)
		#add the rpfs to the RPF list we are currently looping through
		#lstAllRPFFiles.extend(lstUnpackedRPFs)
		#idx += 1

		#delete the rpf file if it is in the tempdir
		if tempDir.lower() in file.lower():
			os.remove( file )

def CheckPath(path):
	lkw = [keyword for keyword in lstKeywords if keyword.lower() in path]
	for kw in lkw:
		resultQueue.put(CReport(path, None, kw, "[in file path]", 0) )


def AddJobsToQueue(lstFiles, keyLength):
	# prework, setup jobs for threading
	size = 0
	for file, rpfFile in lstFiles:
		absFile = os.path.abspath(file.replace("\\","/"))
		size = os.path.getsize(absFile)
		taskpos = 0;
		while taskpos < size:
			#longestKeyLength is on the off chance that the devide happens on a
			#keyword, splitting it
			taskQueue.put( ( absFile, rpfFile, max( 0, taskpos-keyLength ), threadChunk ) )
			taskpos += threadChunk
		size += os.path.getsize(absFile)
		#if we are checking the file path do it here
		if args.includepath == True:
			CheckPath(file)

		allFileQueue.put(file)

	return size

# - - - - - - - - - - - - - - - - - - - - - -
# Setup the input args
# - - - - - - - - - - - - - - - - - - - - - -

parser = argparse.ArgumentParser()
parser.add_argument( "files", help='''
input file (a text file conatining a list of files to search separated by new lines)
can also be a python style list e.g. ["x:\a.exe","x:\b.bin"]
''')
parser.add_argument( "keywords", help="keyword file (a text file conatining a list of keywords to search separated by new lines) ")
parser.add_argument( "output", help="text file to dump output to")
parser.add_argument( "-t", "--tempdir", help="Location to extract RPFs to default : {0}".format(tempDir) )
parser.add_argument( "-w", "--reportwidth", help="size of surrounding area to be reported. Default : {0}".format(reportWidth) )
parser.add_argument( "-v", "--verbose", help="output extra info", action='store_true' )
parser.add_argument( "-n", "--numthreads", help="number of threads" )
parser.add_argument( "-i", "--includepath", help="include the file path in the search", action='store_true' )
args = parser.parse_args()

if not "RS_TOOLSIRONLIB" in os.environ:
	print("RS_TOOLSIRONLIB not set")
	sys.exit()
unpacktool = os.path.expandvars(unpacktool)

# setup based on args
if args.files == None or args.keywords == None:
	parser.print_help()
	sys.exit()

lstInputFiles = create_list(args.files)
lstKeywords = create_list(args.keywords)
#for kw in lstKeywords:
#	if
longestKeyLength = len(max(lstKeywords, key=len))

op = None
try:
	op = open(args.output, 'w')
except OSError:
	print("output path not valid...output will be dumped to console")
	op = None
	# handle error here

if not args.tempdir == None:
	tempDir = args.tempdir
	tempDir = tempDir.replace("\\","/")
	if "system32" in tempDir or "c:/" == tempDir or "c://" == tempDir:
		print("\n\n\nYou have set a dangerous directory for your temp\n\n\n")
		sys.exit()


if not os.path.exists( tempDir ):
	try:
		os.makedirs( tempDir )
	except:
		print("\ncouldn't create temporary folder....exiting\n")
		sys.exit()
else:
	shutil.rmtree( tempDir )

if not args.numthreads == None:
	try:
		nThreads = int(args.numthreads)
	except:
		nThreads = 1

#these are now both Queues to make them thread safe
#lstAllFiles = []
#lstAllRPFFiles = [] #a list of tuples for convenience (filepath, temp_dir, rpf_parent)
					#this is so we can keep track of where RPFs actually came from

lstLooseFiles = []
#for input in lstInputFiles:
idx = 0
while idx < len(lstInputFiles):
	input = lstInputFiles[idx]
	#files
	if not os.path.isdir(input):
		if input.endswith(".rpf"):
			#lstAllRPFFiles.extend((f, tempDir, f) for f in glob.glob(file))
			for f in glob.glob(input):
				rpfQueue.put((f, tempDir, os.path.dirname(f) ))
		else:
			lstTmpFiles = glob.glob(input)
			for f in lstTmpFiles:
				if f.endswith(".rpf"):
					rpfQueue.put( ( f, tempDir, os.path.dirname(f) ) )
				else:
					lstLooseFiles.append(f)
	#folders
	else:
		for dir, subdir, files in os.walk(input):
			for tmp in files:
				lstInputFiles.append( os.path.join(dir,tmp) )
	idx = idx +  1

AddJobsToQueue( [(f,None) for f in lstLooseFiles], longestKeyLength )

#- - - - - - - - - - - - - - - - - -
# Start the worker threads working on on RPF files
# and then unpack the RPf ones
#- - - - - - - - - - - - - - - - - -
lstReadThreads = []
lstUnpackThreads = []

#scans through data looking for keywords
for i in range(0, nThreads):
	lstReadThreads.append( ThreadReadFile( taskQueue, resultQueue, "[ReadThread{0}]".format(i) ) )

#feeds the read threads with unpacked data
for i in range(0, nUnpackThreads):
	lstUnpackThreads.append( ThreadUnpackFile(rpfQueue, "[UnpackThread{0}]".format(i) ) )

print("{0} jobs in total, between {1} threads".format( taskQueue.qsize(), nThreads ) )

for thread in lstReadThreads:
	thread.start()

for thread in lstUnpackThreads:
	thread.start()

#- - - - - - - - - - - - - - - - - -
# unpack RPF files here, and add unpacked files to job queue
#- - - - - - - - - - - - - - - - - -
print("Working")

#p = cProfile.Profile()
#p.enable()

oldnTasks = -1
oldnScan = -1
while True:
	nU = len([1 for t in lstUnpackThreads if t.status == ThreadUnpackFile.WORKING])
	nR = len([1 for t in lstReadThreads if t.status == ThreadReadFile.WORKING])
	nTasks = rpfQueue.qsize()
	nScan = taskQueue.qsize()

	if oldnTasks != nTasks or oldnScan != nScan:
		sys.stdout.write("{0} Unpack Jobs left, [{1}/{2}] Threads Active\n".format( nTasks+nU, nU, nUnpackThreads ))
		sys.stdout.write("{0} Scan Jobs left, [{1}/{2}] Threads Active\n".format( nScan+nR, nR, nThreads ))
		sys.stdout.flush()
		if oldnScan != nScan:
			oldnScan = nScan
		if oldnTasks != nTasks:
			oldnTasks = nTasks

	if nU == 0 and nTasks == 0: #break on unpacking finishing
		ThreadUnpackFile.killAll = True
		break

	time.sleep(2)

#possibly redundant
for thread in lstUnpackThreads:
	thread.join()

#- - - - - - - - - - - - - - - -
# ok we are done unpacking lets
# start waiting for the worker threads to finish
#- - - - - - - - - - - - - - - -

while threading.activeCount() > 1:
	n = len([1 for t in lstReadThreads if t.status == ThreadReadFile.WORKING])
	nTasks = taskQueue.qsize()
	if oldnScan != n:
		sys.stdout.write("{0} Scan Jobs left, [{1}/{2}] Threads Active\n".format(nTasks+n, n, nThreads))
		sys.stdout.flush()
		if oldnScan != n:
			oldnScan = n
	if n == 0 and nTasks == 0:
		ThreadReadFile.killAll = True
		#sys.stdout.write("{0} Jobs left, [{1}/{2}] Threads Active\n".format(nTasks, n, nThreads))
		break;
	time.sleep(2)
sys.stdout.flush()

# - - - - - - - - - - - - - - - -
# sort output data
# - - - - - - - - - - - - - - - -
nResults = 0
dctGroup = {}
while True:
	try:
		q = resultQueue.get(False)
		if not q.keyword in dctGroup:
			dctGroup[ q.keyword ] = []
		dctGroup[ q.keyword ].append( q )
		nResults += 1
	except queue.Empty:
		break
# - - - - - - - - - - - - - - - -
# write output data
# - - - - - - - - - - - - - - - -
writeFunc = print
if op:
	writeFunc = op.write

writeFunc("results:{0}\n".format(nResults))

lstAllFiles = []
while True:
	try:
		file = allFileQueue.get(block=False)
		lstAllFiles.append(file)
	except queue.Empty:
		break
	except Exception as e:
		break

if args.verbose:
	writeFunc("keywords:\n")
	for kw in lstKeywords:
		writeFunc("\t{0}\n".format(kw))
	writeFunc("files:\n")
	for file in lstAllFiles:
		writeFunc("\t{0}\n".format(file))

	writeFunc("number of parsed files : {0}\n".format( len(lstAllFiles) ))
	#writeFunc("size of all parsed files : {0}\n".format( projected_size ))

for key in dctGroup:
	writeFunc( key+" ({0}) \n".format(len(dctGroup[key])) )
	for item in dctGroup[key]:
		writeFunc(str(item))

if op:
	op.close()

#possibly redundant
for thread in lstReadThreads:
	thread.join()