import sys
from difflib import SequenceMatcher
from collections import namedtuple
from hashlib import shake_128
from re import sub, search

# Other log file names, presumably earlier inputs kept for reference:
#   't202-201705.log' 't3-201705.log'
# Base name of the log being analysed; saveIt() writes its report to
# filepath + '-Full.classes'.
filepath = 'node-2017.log'
# NOTE(review): MIN_LEN is not referenced anywhere in the visible code.
MIN_LEN = 2
# Similarity threshold used by the main loop: a message whose best ratio is
# below this value starts a new event class instead of being merged.
MIN_RATIO = 0.33
# NOTE(review): CLASS_MIN_RATIO, MIN_MATCHING_LENGTH_RATIO, HASH_LENGTH and
# DEBUG are not referenced anywhere in the visible code.
CLASS_MIN_RATIO = 0.75
MIN_MATCHING_LENGTH_RATIO = 0.9
HASH_LENGTH = 8
DEBUG = False

# Record layout for one parsed log line (metadata fields plus the message).
logEntry = namedtuple("logEntry", "key epoch date time node facility severity pid daemon message")

# All event classes discovered so far. Each element is a dict holding the
# token list under the key 'pattern' and, for variable positions, an integer
# key (the token index) mapped to the list of values observed there.
eventClass = []

# Running counters, updated by logIsProcessed / logIsIgnored / logIsAccepted
# and reported by saveIt().
totalLogCount = 0
ignoredLogCount = 0
acceptedLogCount = 0

def logIsProcessed():
    """Count one more processed log line and checkpoint periodically.

    Bumps the module-level ``totalLogCount``. Each time the new total is a
    multiple of 10000, the current ``eventClass`` list is written out through
    ``saveIt``.
    """
    global totalLogCount
    totalLogCount = totalLogCount + 1
    atCheckpoint = not (totalLogCount % 10000)
    if atCheckpoint:
        saveIt(eventClass)
    
def logIsIgnored():
    """Count one more log line that was skipped (module-level ``ignoredLogCount``)."""
    global ignoredLogCount
    ignoredLogCount = ignoredLogCount + 1
    
def logIsAccepted(log):
    """Count one more accepted log line (module-level ``acceptedLogCount``).

    ``log`` is taken for the caller's convenience only; it is not inspected.
    """
    global acceptedLogCount
    acceptedLogCount = acceptedLogCount + 1
    
def simRatio(s, sl, nl):
    """Return the share of significant pattern terms that a message matched.

    Args:
        s: Number of significant (non-placeholder) terms that matched.
        sl: Number of significant terms in the pattern.
        nl: Total pattern length. It carries no weight: every length is
            scaled by 1.0. The parameter is kept so existing callers keep
            working and so a length-dependent weight can be added later.

    Returns:
        ``0.0`` when ``sl`` is zero, otherwise ``s / sl`` as a float.
    """
    if sl == 0:
        return 0.0
    # Always return a number: the former per-length branch chain fell through
    # (returning None) for any nl that was not <= 1, == 2, == 3 or >= 4.
    return float(s / sl)
    
def updatePatterns(msg, ptrn, idx):
    """Record a message as a new event class or merge it into an existing one.

    The message is always counted as accepted. If every token of ``msg`` is a
    placeholder ('$' or '#'), nothing else happens.

    Args:
        msg: Token list of the incoming message.
        ptrn: Token list to store as the class pattern; '#' marks a variable
            position.
        idx: Index into ``eventClass`` of the class to update, or -1 to
            append a new class.

    For an update, each '#' position of ``ptrn`` collects the concrete values
    seen there (from the stored pattern and from ``msg``) in a list kept under
    the integer position key of the class dict. The stored pattern is then
    replaced by ``ptrn``.
    """
    logIsAccepted(msg)

    placeholders = ('$', '#')

    # A message made only of placeholders is not a useful pattern.
    if all(term in placeholders for term in msg):
        return

    if idx == -1:
        eventClass.append({'pattern': ptrn})
        return

    entry = eventClass[idx]
    for pos, term in enumerate(ptrn):
        if term != '#':
            continue

        if pos in entry:
            # Values were already collected here: add unseen ones, taking the
            # stored pattern's term first and the message's term second.
            observed = entry[pos]
            previous = entry['pattern'][pos]
            if previous not in placeholders and previous not in observed:
                observed.append(previous)
            current = msg[pos]
            if current not in placeholders and current not in observed:
                observed.append(current)
        else:
            # First variable value at this position: create the list only if
            # there is something concrete to store, message term first.
            current = msg[pos]
            currentIsConcrete = current not in placeholders
            if currentIsConcrete or entry['pattern'][pos] not in placeholders:
                entry[pos] = []
            if currentIsConcrete:
                entry[pos].append(current)
            previous = entry['pattern'][pos]
            if previous not in placeholders:
                entry[pos].append(previous)

    entry['pattern'] = ptrn
            
def saveIt(eventClass):
    """Write the counters and all event classes to ``filepath + '-Full.classes'``.

    The file starts with a summary line built from the module-level counters,
    followed by one paragraph per event class (sorted by pattern). Each
    paragraph lists every key of the class dict with its values joined by
    single spaces.

    Args:
        eventClass: List of event-class dicts; each must have a 'pattern' key
            and every value must be a list of strings.
    """
    # Sort before opening, so an entry without 'pattern' fails without
    # truncating a report written earlier.
    sortedEventClass = sorted(eventClass, key=lambda k: k['pattern'])

    # The context manager flushes and closes the file even when a write fails.
    with open(filepath + '-Full.classes', 'w') as ft:
        ft.write("Logs: {} - {} = {} ({} Event Classes)\n\n".format(totalLogCount,
                                                                  ignoredLogCount,
                                                                  acceptedLogCount,
                                                                  len(eventClass)))
        for e in sortedEventClass:
            for key, terms in e.items():
                ft.write("{}: {}\n".format(str(key), ' '.join(terms)))
            ft.write("\n")

            
def _normalizeMessage(rawMessage):
    """Turn a raw log message into a list of normalised tokens.

    The message is lower-cased, and variable-looking parts (parenthesised
    text and any word containing a digit) are replaced by the placeholder
    '#'. The steps are order-dependent.
    """
    text = rawMessage.strip().lower()
    text = sub(r'\((.*?)\)', '0', text)          # parenthesised text -> '0'
    text = sub(r'=+', ' ', text)                 # '=' acts as a separator
    text = sub(r'[^a-z0-9_\ ]+', '', text)       # drop all other special characters
    text = sub(r'(\w*\d\w*)+', '0', text)        # any word containing a digit -> '0'
    text = sub(r'0+', '# ', text)                # runs of '0' -> a single '#' token
    text = sub(r'\s+', ' ', text)                # collapse whitespace
    return text.strip().split(' ')


def _findBestMatch(tokens):
    """Find the known event class that is most similar to ``tokens``.

    Only classes whose pattern has the same number of tokens are compared.
    Placeholder positions ('$' or '#') of a pattern are not scored.

    Returns:
        A tuple ``(index, ratio, mergedPattern)``. ``index`` is -1 (with
        ratio -1 and an empty pattern) when no class has the same length.
        ``mergedPattern`` keeps the terms that matched and has '#' everywhere
        else. On equal ratios the earliest class wins.
    """
    bestIndex = -1
    bestRatio = -1
    bestPattern = []
    for index, event in enumerate(eventClass):
        known = event['pattern']
        if len(tokens) != len(known):
            continue

        similarity = 0
        significantLen = 0
        merged = ['#'] * len(known)
        for pos, term in enumerate(known):
            if term == '$' or term == '#':
                continue
            significantLen += 1
            if tokens[pos] == term:
                similarity += 1
                merged[pos] = tokens[pos]

        ratio = simRatio(similarity, significantLen, len(known))
        if ratio > bestRatio:
            bestIndex = index
            bestRatio = ratio
            bestPattern = merged
    return bestIndex, bestRatio, bestPattern


# Log lines are streamed from stdin. ``filepath`` is not opened for reading;
# saveIt() only uses it to name the report file.
for line in sys.stdin:
    logIsProcessed()
    record = line.strip()

    # Expected layout: "<8 space-separated metadata fields> <> <message>".
    try:
        logMeta, logMsg = record.split(" <> ")
    except ValueError:
        print(record, "is broken!")
        logIsIgnored()
        continue

    logMeta = logMeta.split(" ")
    if len(logMeta) < 8:
        # Too few metadata fields: treat like any other broken line.
        print(record, "is broken!")
        logIsIgnored()
        continue

    # Fields after the key: epoch date time node facility severity pid daemon.
    entry = logEntry(0, *logMeta[:8], _normalizeMessage(logMsg))
    tokens = entry.message

    matchingIndex, matchingRatio, matchingPattern = _findBestMatch(tokens)
    if matchingIndex == -1 or matchingRatio < MIN_RATIO:
        # Nothing comparable (or not similar enough): start a new class.
        updatePatterns(tokens, tokens, -1)
    else:
        updatePatterns(tokens, matchingPattern, matchingIndex)

print("Logs: {} - {} = {} ({} Event Classes)\n".format(totalLogCount,
                                                       ignoredLogCount,
                                                       acceptedLogCount,
                                                       len(eventClass)))
saveIt(eventClass)
