Files
2025-09-29 00:52:08 +02:00

534 lines
21 KiB
Python
Executable File

import subprocess
import json
import datetime
import time
import calendar
import socket
import os
import re
import thread
#import MySQLdb
import sys
from ftplib import FTP
DEBUG = False
# Separator used when building Graphite plaintext-protocol lines
PrintOutDelimeter = ' '
#listOfStats.txt should be in the same folder as this .py file.
FILEPATHOFSTATSTOUPDATE= os.path.join(os.path.dirname(__file__), r'listOfStats.txt')
LOADBALANCERS = ["vpc-elb-rpdxtmweb", "vpc-elb-rpdxcsweb", 'vpc-elb-rpdxtwem']
# Regex matched against the EC2 Name tag -> Graphite bucket name.
# BUGFIX: the original listed the 'RPDXNUM\d+$' key twice (same value);
# the duplicate has been removed.
EC2MACHINENAMEBUCKETS = {'RPDXCSWEB\d+$':'rpdxcsweb',
                         'RPDXNUM\d+$':'rpdxnum',
                         'RPDXREDIS\d+$':'rpdxredis',
                         'RPDXSQL\d+$':'rpdxsql',
                         'RPDXTMWEB\d+$':'rpdxtmweb',
                         'RPDXNUT\d+$':'rpdxnut',
                         'RPDXCSV\d+$':'rpdxcsv'}
# Filled at startup with (friendlyName, instanceId, bucket) tuples for
# every running EC2 instance whose Name tag matched a bucket regex.
EC2VALIDINSTANCEIDS = []
AWSSERVERNAMESPACES = {'ELB':r'AWS/ELB', 'CUSTOM':'"Custom Metrics"', 'EC2':r'AWS/EC2', 'Dynamo':r'AWS/DynamoDB'}
# Start/end of the CloudWatch query window; set in __main__ before any poll.
g_startStatTime = None
g_currentTime = None
# what graphite server we are sending to
GRAPHITE_IP = 'tewrgraph1.taketwo.online'
GRAPHITE_PORT = 2003
# what internal sql server we are sending to (SQL path currently commented out)
SQL_ADDRESS = "RSGSANFPS2/online"
SQL_USER = "online"
SQL_PASSWORD = "ewrewrewr"
SQL_DB = "online"
FTP_HOST="ftp.rockstarsd.com"
FTP_USER="online"
FTP_PASSWORD="ewrewrewr"
FTP_DIRECTORY={'AWS':"GraphiteLogs\\AWS", 'EC2':'GraphiteLogs\\EC2'}
THREAD_COUNTER = 0
# One flag per worker thread; the main thread busy-waits until all are True.
LIST_OF_THREADS_FINISHED= []
AWS_CSV_STRING = ''
EC2_CSV_STRING = ''
SOCK = None
"""
HELPER FUNCTIONS
"""
def getLatestTimestampEntry(jsonListOfDict):
    """Return the second-to-latest datapoint by its 'Timestamp' field.

    The latest CloudWatch datapoint may still be incomplete, so we
    deliberately take the one before it.

    BUGFIX: the original indexed [-2] unconditionally and raised
    IndexError when exactly one datapoint existed (callers only guard
    against an empty list); we now fall back to the latest entry.
    """
    ordered = sorted(jsonListOfDict,
                     key=lambda x: datetime.datetime.strptime(x['Timestamp'], "%Y-%m-%dT%H:%M:%SZ"))
    return ordered[-2] if len(ordered) >= 2 else ordered[-1]
def getAWSStatListFromFile():
    """Parse listOfStats.txt into a list of stat-definition lists.

    Each non-blank line is split on ';' and every field stripped.
    Field layout (hard-coded by callers):
      [0] structured Graphite name of the stat
      [1] stat name to ping AWS with
      [2] --statistics value (Sum/Average/...)
      [3] metrics server namespace key (see AWSSERVERNAMESPACES)

    Uses a `with` block so the file is closed even if parsing raises
    (the original open/close pair leaked the handle on error).
    """
    listOfMetricNames = []
    with open(FILEPATHOFSTATSTOUPDATE, 'r') as f:
        for line in f:
            if not line.isspace():
                listOfMetricNames.append([item.strip() for item in line.split(';')])
    return listOfMetricNames
def getSQLTimeString(timeSinceEpoch):
    """Format an epoch timestamp as an RFC-2822-style UTC string for SQL rows."""
    utc_struct = time.gmtime(timeSinceEpoch)
    return time.strftime("%a, %d %b %Y %H:%M:%S +0000", utc_struct)
def getAWSTimeString():
    """Return the current UTC time as a zero-padded ISO-8601 timestamp.

    BUGFIX: the original used bare %d for year/month/day, producing
    e.g. "2025-1-5T03:04:05" for single-digit months/days.  ISO 8601
    (and lexicographic sorting of the CSV date column) requires the
    zero padding, which strftime provides.
    """
    return time.strftime("%Y-%m-%dT%H:%M:%S", time.gmtime(int(time.time())))
def getCSVFileName(statType):
    """Build a timestamped CSV filename for the given stat family.

    statType 'AWS' -> 'aws<epoch>.csv'; 'EC2' -> 'ec2_<epoch>.csv'.

    Raises ValueError for any other statType: the original silently
    returned None, which only crashed later when file() was called.
    """
    timestamp = int(time.time())
    if statType == 'AWS':
        return "aws" + str(timestamp) + ".csv"
    elif statType == 'EC2':
        return 'ec2_' + str(timestamp) + '.csv'
    raise ValueError("unknown statType: %r" % statType)
def sendFileToFTP(f, statType):
    """Upload an open file via FTP STOR into FTP_DIRECTORY[statType].

    f        -- an open file object (its .name is used as the remote name)
    statType -- 'AWS' or 'EC2', selecting the remote directory

    The quit() now runs in a finally block so the FTP control
    connection is not leaked when login/cwd/STOR raises.
    """
    ftp = FTP(FTP_HOST)
    try:
        ftp.login(user=FTP_USER, passwd=FTP_PASSWORD)
        ftp.cwd(FTP_DIRECTORY[statType])
        ftp.storlines("STOR " + f.name, f)
    finally:
        ftp.quit()
"""
/HELPER FUNCTIONS
"""
"""
STAT CLASSES
"""
class Stat:
    """Base class for a single Graphite datapoint.

    Holds the dotted Graphite metric path, the capture time (epoch
    seconds, fixed at construction) and the value to report.
    """
    structuredOutputName = ""
    timeSinceEpoch = 0
    value = None
    def __init__(self, structuredName):
        self.structuredOutputName = structuredName
        self.timeSinceEpoch = int(time.time())
    def formatData(self):
        # Graphite plaintext protocol line: "<metric> <value> <epoch>\n"
        fields = (self.structuredOutputName, str(self.value), str(self.timeSinceEpoch))
        return PrintOutDelimeter.join(fields) + "\n"
class AWSStat(Stat):
    """One CloudWatch metric poll, pushable to Graphite, SQL or CSV.

    Assumption: this class is only going to configure for one stat, per
    load balancer.  The aws-cli is shelled out to via subprocess, and
    the query window comes from the module globals
    g_startStatTime/g_currentTime (set in __main__).
    """
    #variables
    metricName = ""
    statisticValueName = ""
    loadBalancer = ""
    namespace = ""
    cmdToPoll = ""
    def __init__(self, structuredName, metricName, statisticValueName, namespace, loadBalancer=None):
        # structuredName      -- dotted Graphite metric path
        # metricName          -- CloudWatch --metric-name value
        # statisticValueName  -- CloudWatch --statistics value (Sum/Average/...)
        # namespace           -- one of the AWSSERVERNAMESPACES values
        # loadBalancer        -- ELB name; only used for the ELB namespace
        Stat.__init__(self, structuredName)
        self.metricName = metricName
        self.statisticValueName = statisticValueName
        self.loadBalancer = loadBalancer
        self.namespace = namespace
    def generateCmdToPoll(self):
        # Build the aws-cli command line matching this stat's namespace.
        # NOTE(review): cmdToPoll is later passed to subprocess as one
        # string; that works on Windows, but on POSIX check_output would
        # need shell=True or an argv list — confirm deployment platform.
        if self.namespace == AWSSERVERNAMESPACES['CUSTOM']:
            self.cmdToPoll = 'aws cloudwatch get-metric-statistics --namespace %s --metric-name "%s" --statistics %s --start-time %s --end-time %s --period 60' % (self.namespace, self.metricName, self.statisticValueName, g_startStatTime, g_currentTime)
        elif self.namespace == AWSSERVERNAMESPACES['ELB']:
            self.cmdToPoll = 'aws cloudwatch get-metric-statistics --namespace %s --metric-name "%s" --statistics %s --dimensions Name=LoadBalancerName,Value=%s --start-time %s --end-time %s --period 60' % (self.namespace, self.metricName, self.statisticValueName, self.loadBalancer, g_startStatTime, g_currentTime)
        # print self.cmdToPoll
    def setValue(self):
        # Poll CloudWatch and stash the chosen statistic on self.value.
        # Returns 0 on success, 1 on label mismatch, 2 when AWS returned
        # no datapoints at all (the command line is wrong if you see 2).
        self.generateCmdToPoll()
        jsonDict = json.loads(subprocess.check_output(self.cmdToPoll))
        if jsonDict['Label'] == self.metricName:
            #grab the last point to send
            if len(jsonDict['Datapoints']) == 0:
                #about to crash, dump something useful
                #our cmd line is wrong is you see this
                print self.metricName + self.statisticValueName + self.namespace + self.structuredOutputName
                print json.dumps(jsonDict, indent=4)
                return 2
            self.value = getLatestTimestampEntry(jsonDict['Datapoints'])['%s' % self.statisticValueName]
            return 0
        return 1
    def formatDataForSQL(self):
        #example expected formatting:
        #INSERT INTO table VALUES (x, y, z)
        # NOTE(review): values are spliced in by string concatenation —
        # any field containing a quote breaks the statement (and this is
        # injectable); use parameterized queries if the SQL path (currently
        # commented out in __main__) is ever re-enabled.
        dbName = "online.dbo.AmazonStats"
        namespace = self.namespace
        metric = self.metricName
        value = self.value
        date = getSQLTimeString(self.timeSinceEpoch)
        graphiteNameSpace = self.structuredOutputName
        return "INSERT INTO " + dbName + " VALUES ('" + namespace + "', '" + metric + "', '" + str(value) + "', '" + str(date) + "', '" + graphiteNameSpace + "')"
    def formatDataForCSV(self):
        #example expected formatting:
        #namespace,metric,value,date,graphiteNameSpace
        namespace = self.namespace
        metric = self.metricName
        value = self.value
        date = getAWSTimeString()
        graphiteNameSpace = self.structuredOutputName
        return namespace + "," + metric + "," + str(value) + "," + str(date) + "," + graphiteNameSpace + '\n'
class DynamoStat(AWSStat):
    """DynamoDB-flavoured AWSStat, optionally scoped to one table Operation.

    listOfInputs comes straight from a listOfStats.txt row:
      [0] Graphite name, [1] metric name, [2] statistic value,
      [3] namespace key, [4] (optional) DynamoDB Operation name.
    """
    operationName = None
    # hard-coded table that every Dynamo stat is polled against
    tableName = 'rsg-cloudsaves-gtav'
    def __init__(self, listOfInputs):
        AWSStat.__init__(self, listOfInputs[0], listOfInputs[1], listOfInputs[2], AWSSERVERNAMESPACES[listOfInputs[3]], None)
        if len(listOfInputs) > 4:
            self.operationName = listOfInputs[4]
        self.generateCmdToPoll()
        # print self.cmdToPoll
    def generateCmdToPoll(self):
        # Two command shapes: with an Operation dimension (per-API-call
        # metric) or table-wide.  Window comes from module globals.
        if self.namespace == AWSSERVERNAMESPACES['Dynamo']:
            if self.operationName:
                #Table Operation Metric
                self.cmdToPoll = 'aws cloudwatch get-metric-statistics --namespace %s --metric-name "%s" --statistics "%s" --start-time "%s" --end-time "%s" --period 60 --dimensions "Name=TableName,Value=%s" "Name=Operation,Value=%s"' % (self.namespace, self.metricName, self.statisticValueName, g_startStatTime, g_currentTime, self.tableName, self.operationName)
            else:
                #Table Metric
                self.cmdToPoll = 'aws cloudwatch get-metric-statistics --namespace %s --metric-name "%s" --statistics "%s" --start-time "%s" --end-time "%s" --period 60 --dimensions "Name=TableName,Value=%s"' % (self.namespace, self.metricName, self.statisticValueName, g_startStatTime, g_currentTime, self.tableName)
    def formatDataForCSV(self):
        #example expected formatting:
        #namespace,metric,value,date,graphiteNameSpace
        # When scoped to an Operation, the operation name replaces the
        # metric name in the CSV's metric column.
        date = getAWSTimeString()
        if self.operationName:
            return self.namespace + "," + self.operationName + "," + str(self.value) + "," + str(date) + "," + self.structuredOutputName + "\n"
        else:
            return self.namespace + "," + self.metricName + "," + str(self.value) + "," + str(date) + "," + self.structuredOutputName + "\n"
class EC2Stat(AWSStat):
    """AWSStat for a single EC2 instance (AWS/EC2 namespace).

    Adds the instance's friendly Name-tag value and its InstanceId; the
    CloudWatch poll is dimensioned on InstanceId.
    """
    friendlyEC2Name = ""
    instanceId = ""
    def __init__(self, structuredName, metricName, statisticValueName, namespace, friendlyName, instanceId, LoadBalancer=None):
        AWSStat.__init__(self, structuredName, metricName, statisticValueName, namespace, LoadBalancer)
        self.friendlyEC2Name = friendlyName
        self.instanceId = instanceId
    def generateCmdToPoll(self):
        # --period 300 (vs 60 elsewhere): EC2 basic monitoring only
        # publishes datapoints every 5 minutes.
        if self.namespace == AWSSERVERNAMESPACES['EC2']:
            self.cmdToPoll = 'aws cloudwatch get-metric-statistics --namespace %s --metric-name "%s" --statistics "%s" --start-time "%s" --end-time "%s" --period 300 --dimensions "Name=InstanceId,Value=%s"' % (self.namespace, self.metricName, self.statisticValueName, g_startStatTime, g_currentTime, self.instanceId)
    def setValue(self):
        # Poll CloudWatch for this instance and stash the statistic on
        # self.value.  Returns 0 on success, 1 on label mismatch, 2 when
        # the machine produced no datapoints (expected for some machines,
        # unlike the base class where 2 signals a broken command line).
        self.generateCmdToPoll()
        jsonDict = json.loads(subprocess.check_output(self.cmdToPoll))
        if jsonDict['Label'] == self.metricName:
            #grab the last point to send
            if len(jsonDict['Datapoints']) > 0:
                self.value = getLatestTimestampEntry(jsonDict['Datapoints'])['%s' % self.statisticValueName]
                return 0
            else:
                #Some machines will have no data
                return 2
        return 1
    def formatDataForCSV(self):
        #example expected formatting:
        #friendlyName,instanceId,metric,value,date,graphiteNameSpace
        # (dropped the unused `namespace` local the original computed)
        metric = self.metricName
        value = self.value
        date = getAWSTimeString()
        graphiteNameSpace = self.structuredOutputName
        return self.friendlyEC2Name + ',' + self.instanceId + "," + metric + "," + str(value) + "," + str(date) + "," + graphiteNameSpace + '\n'
class EC2SummedStatCall(object):
    """Fans one CloudWatch metric out across every active EC2 instance,
    grouped into the EC2MACHINENAMEBUCKETS machine buckets.

    Per bucket it either pushes one Graphite stat per instance
    (sendIndividualStats=True) or a single bucket-wide average.
    """
    buckets = {}
    averagedValues = {}
    graphiteBaseNamespace = ""
    timeSinceEpoch = 0
    AWSNamespace = ""
    AWSStatValueName = "" #Average,Max,Min, etc.
    MetricName = ""
    sendIndividualStats = True
    def __init__(self, graphiteNamespace, metricName, statisticValueName, namespace, sendIndividualStats=True):
        # int() for consistency with Stat.timeSinceEpoch (the original
        # stored a float here, giving float timestamps in outstrings)
        self.timeSinceEpoch = int(time.time())
        self.graphiteBaseNamespace = graphiteNamespace
        self.MetricName = metricName
        self.AWSStatValueName = statisticValueName
        self.AWSNamespace = namespace
        # BUGFIX: this parameter used to be silently ignored; every
        # instance fell back to the class-level default of True.
        self.sendIndividualStats = sendIndividualStats
        # BUGFIX: give each instance its OWN dicts.  The original only
        # rebound self.buckets; averagedValues was mutated on the shared
        # class-level dict, so instances clobbered each other's averages.
        self.buckets = {}
        self.averagedValues = {}
        for bucketName in EC2MACHINENAMEBUCKETS:
            self.buckets[EC2MACHINENAMEBUCKETS[bucketName]] = {'Calls':[],'GraphiteNamespace':self.graphiteBaseNamespace.replace('ec2', 'ec2.%s' % EC2MACHINENAMEBUCKETS[bucketName])}
            self.averagedValues[EC2MACHINENAMEBUCKETS[bucketName]] = 0
        self.createEC2StatCalls()
    def createEC2StatCalls(self):
        # Build one EC2Stat per active instance, filed under its bucket.
        # The Graphite name embeds bucket (and instance name when sending
        # individual stats) in place of the literal 'ec2' segment.
        for EC2Data in EC2VALIDINSTANCEIDS:
            if self.sendIndividualStats:
                #LEAVE newStr. Python passes everything as pointers and it's screwing up __init__ of Stat
                newStr = self.graphiteBaseNamespace.replace('ec2', 'ec2.%s.%s' % (EC2Data[2], EC2Data[0]))
                self.buckets[EC2Data[2]]['Calls'].append(EC2Stat(newStr, self.MetricName, self.AWSStatValueName, self.AWSNamespace, EC2Data[0], EC2Data[1]))
            else:
                newStr = self.graphiteBaseNamespace.replace('ec2', 'ec2.%s' % EC2Data[2])
                self.buckets[EC2Data[2]]['Calls'].append(EC2Stat(newStr, self.MetricName, self.AWSStatValueName, self.AWSNamespace, EC2Data[0], EC2Data[1]))
    def populateBucketCallValues(self):
        # Poll every instance in every bucket and compute per-bucket
        # averages over the instances that actually had data.
        for bucket in self.buckets:
            if self.buckets[bucket]['Calls']:
                avgValue = 0
                statBearingEC2 = 0
                for EC2Call in self.buckets[bucket]['Calls']:
                    #individual call to EC2 machine. Will need to be avg'd across all in order to get a value
                    res = EC2Call.setValue()
                    if res == 0:
                        avgValue += EC2Call.value
                        statBearingEC2 += 1
                    elif res == 1:
                        print('ERROR SETTINGS EC2 Value')
                    elif res == 2:
                        print("This EC2 didn't have any datapoints")
                # BUGFIX: the original divided unconditionally and raised
                # ZeroDivisionError when no instance in the bucket had
                # data; we now keep the 0 initialized in __init__.
                if statBearingEC2:
                    self.averagedValues[bucket] = avgValue / statBearingEC2
    def dumpCSVPrint(self):
        # Debug helper: dump every call's CSV line to stdout.
        for bucket in self.buckets:
            for ec2call in self.buckets[bucket]['Calls']:
                print('csv format: ' + ec2call.formatDataForCSV())
    def writeToCSV(self):
        # Append every call's CSV line to the shared module-level buffer.
        global EC2_CSV_STRING
        for bucket in self.buckets:
            for EC2Call in self.buckets[bucket]['Calls']:
                EC2_CSV_STRING += EC2Call.formatDataForCSV()
    def sendData(self, socket):
        # Poll everything, then push to Graphite over the given socket.
        self.populateBucketCallValues()
        for bucket in self.buckets:
            if self.sendIndividualStats:
                #Send Graphite a stat for every InstanceId
                for EC2Call in self.buckets[bucket]['Calls']:
                    outstring = EC2Call.formatData()
                    print(outstring)
                    socket.sendall(outstring)
            else:
                #Send Graphite a stat for every bucket of EC2 machines
                # BUGFIX: the original indexed bucket['GraphiteNamespace'],
                # but `bucket` is the dict KEY (a string) — that raised
                # TypeError.  Look the entry up in self.buckets instead.
                outstring = self.buckets[bucket]['GraphiteNamespace'] + PrintOutDelimeter + str(self.averagedValues[bucket]) + PrintOutDelimeter + str(self.timeSinceEpoch) + '\n'
                print(outstring)
                socket.sendall(outstring)
"""
//STAT CLASSES
"""
"""
Multi threaded starting function
"""
def processEC2SummedStat(threadName, EC2SummedObj, threadID):
    # Thread entry point (started via thread.start_new_thread in
    # __main__): polls and sends one EC2SummedStatCall, then flags its
    # slot in LIST_OF_THREADS_FINISHED so the main thread's wait loop
    # can complete.  threadID indexes that list.
    global SOCK
    print "%s: is starting processing.\n" % threadName
    if not DEBUG:
        EC2SummedObj.sendData(SOCK)
    # NOTE(review): writeToCSV appends to the shared EC2_CSV_STRING from
    # multiple threads; '+=' on a global is a read-modify-write, not
    # atomic, so rows could be lost under contention.  It also runs in
    # DEBUG mode, when sendData was skipped and the values were never
    # populated (CSV rows then contain None).
    EC2SummedObj.writeToCSV()
    print "%s: is finished processing.\n" % threadName
    LIST_OF_THREADS_FINISHED[threadID] = True
"""
//Multi threaded starting function
"""
"""
ENTRY POINT
"""
if __name__ == "__main__":
    #entry point
    #Get active EC2 Machines
    # Shell out to the aws-cli and collect every running instance whose
    # first Name tag matches one of the bucket regexes.
    ec2Instances = json.loads(
        subprocess.check_output("aws ec2 describe-instances")
    )
    # print json.dumps(ec2Instances['Reservations'][5], indent=4)
    for resv in ec2Instances['Reservations']:
        for inst in resv['Instances']:
            instanceTags = inst.get('Tags')
            if instanceTags and len(instanceTags) and instanceTags[0]['Value']:
                for regex in EC2MACHINENAMEBUCKETS:
                    #Looking at the first one always? IDK if this will be a problem with multiple sets of tags?
                    matchObj = re.match(regex, instanceTags[0]['Value'], flags=0)
                    if matchObj:
                        if inst['State']['Name'] == 'running':
                            #only want active servers
                            #Friendly ServerName : instanceID
                            EC2VALIDINSTANCEIDS.append((instanceTags[0]['Value'], inst['InstanceId'], EC2MACHINENAMEBUCKETS[regex]))
                        break
    #connect to servers
    # graphite web server
    if not DEBUG:
        SOCK = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            SOCK.connect((GRAPHITE_IP, GRAPHITE_PORT))
        except:
            print("error could not connect to graphite server")
            sys.exit(1)
    #internal sql db
    #try:
    #    sqlConnection = MySQLdb.connect(host=SQL_ADDRESS, user=SQL_USER, passwd=SQL_PASSWORD, db=SQL_DB)
    #except Exception as e:
    #    print("error could not connect to SQL database")
    #    print(e)
    #    sys.exit(1)
    #csv file
    fAWSname = getCSVFileName('AWS')
    awsFile = file(fAWSname, "w")
    fEC2name = getCSVFileName('EC2')
    ec2File = file(fEC2name, 'w')
    #sqlCursor = sqlConnection.cursor()
    """
    DO ALL THE WORK - Took out the while loop to put this into a scheduled task
    """
    #read file every time so script doesn't need to be restarted
    # Build the CloudWatch query window: from ~65 minutes ago to now.
    g_currentTime = time.gmtime(int(time.time()))
    dt = datetime.datetime.fromtimestamp(time.mktime(g_currentTime))
    #[THP] Why we have to subtract this hour? blame python
    # NOTE(review): mktime() interprets the UTC struct as LOCAL time, so
    # the extra hour presumably compensates for the host's UTC offset —
    # verify this still holds on the deployment machine/timezone.
    pastStructTime = (dt - datetime.timedelta(minutes=5) - datetime.timedelta(hours=1)).timetuple()
    # NOTE(review): month/day are not zero-padded here (%d), so these
    # strings are not strict ISO 8601 for single-digit months/days.
    g_currentTime = "%d-%d-%dT%02d:%02d:%02d" % (g_currentTime[0], g_currentTime[1], g_currentTime[2], g_currentTime[3], g_currentTime[4], g_currentTime[5])
    g_startStatTime = "%d-%d-%dT%02d:%02d:%02d" % (pastStructTime[0], pastStructTime[1], pastStructTime[2], pastStructTime[3], pastStructTime[4], pastStructTime[5])
    # print g_currentTime
    # print g_startStatTime
    #CREATE THE CALLS
    # EC2 stats get a worker thread each (they fan out to many instances);
    # everything else is collected into AWSStatRequests and polled serially.
    AWSStatRequests = []
    # EC2StatRequests = []
    statNames = getAWSStatListFromFile()
    for stat in statNames:
        if AWSSERVERNAMESPACES[stat[3]] == AWSSERVERNAMESPACES['EC2']:
            #Creating EC2Stat
            #TODO: If we want to dump into buckets, add False to the __init__ args
            # Just do everything here. Python likes to copy our dictionaries to point all instance's object to a single self.buckets
            obj = EC2SummedStatCall(stat[0], stat[1], stat[2], AWSSERVERNAMESPACES[stat[3]] )
            # append BEFORE starting the thread so threadID is valid
            LIST_OF_THREADS_FINISHED.append(False)
            thread.start_new_thread(processEC2SummedStat, ("Thread-%d" % THREAD_COUNTER, obj, THREAD_COUNTER))
            THREAD_COUNTER += 1
        else:
            #Creating AWSStat
            if stat[3] == 'ELB':
                #Make two jobs for loadbalancer specific jobs
                for lb in LOADBALANCERS:
                    AWSStatRequests.append(AWSStat(stat[0].replace('elb','elb.%s' % lb), stat[1], stat[2], AWSSERVERNAMESPACES[stat[3]], lb))
            elif stat[3] == 'CUSTOM':
                #Doesn't matter which loadbalancer we ask as they going to reference third party stat pool
                AWSStatRequests.append(AWSStat(stat[0], stat[1], stat[2], AWSSERVERNAMESPACES[stat[3]]))
            elif stat[3] == 'Dynamo':
                AWSStatRequests.append(DynamoStat(stat))
            # print stat[0]
            # print AWSStatRequests[-1].timeSinceEpoch
    # print len(AWSStatRequests)
    # print len(EC2StatRequests)
    #GET THAT DATA
    for AWSCall in AWSStatRequests:
        # setValue returns non-zero on failure (1 = label mismatch,
        # 2 = no datapoints); skip sending those.
        if AWSCall.setValue():
            print('ERROR WHEN SETTING THE AWS VALUE')
            continue
        #SEND THAT DATA
        outstring = AWSCall.formatData()
        #debug printing to console so we can check status
        #send to Graphite
        if not DEBUG:
            SOCK.sendall(outstring)
        print(outstring)
        #send the data to the sql server
        #dbstring = AWSCall.formatDataForSQL()
        #print(dbstring)
        #sqlConnection.execute(dbstring)
        #sqlConnection.commit()
        #send the data to a csv file
        csvstring = AWSCall.formatDataForCSV()
        AWS_CSV_STRING += (csvstring)
    #Wait for all threads to be done processing
    # Busy-wait until every worker flagged its LIST_OF_THREADS_FINISHED slot.
    while (True):
        bAllThreadsDone = True
        # NOTE(review): this loop variable shadows the `thread` module;
        # harmless here only because start_new_thread isn't used after
        # this point.
        for thread in LIST_OF_THREADS_FINISHED:
            if thread == False:
                bAllThreadsDone = False
                break
        if bAllThreadsDone:
            break
        print 'All threads arent done.'
        threadStatuses = ''
        for item in LIST_OF_THREADS_FINISHED:
            threadStatuses += str(item) + ' '
        print threadStatuses
        time.sleep(5)
    #Write to files now that all the threads are done
    awsFile.write(AWS_CSV_STRING)
    awsFile.close()
    if not DEBUG:
        sendFileToFTP(file(fAWSname, "r"),'AWS')
    ec2File.write(EC2_CSV_STRING)
    ec2File.close()
    if not DEBUG:
        sendFileToFTP(file(fEC2name, 'r'),'EC2')
    #close down
    print("Script terminating")
    if not DEBUG:
        SOCK.close()
    #os.remove(fAWSname)
    #os.remove(fEC2name)
"""
//ENTRY POINT
"""