# Import
import re
import numpy as np
import datetime

# Read all log
def single(fn,Params):
    """Parse one access-log file into table names, column models and row data.

    Parameters
    ----------
    fn : str
        Path of the log file to read.
    Params : object
        Settings object; only its ``BUCKET_PDS`` attribute (a string) is
        read here. It is also forwarded unchanged to ``gen_add_data``.

    Returns
    -------
    bucket : str
        ``Params.BUCKET_PDS``.
    table_names : list of str
        One ``"<bucket with '-' -> '_'>_<year>"`` entry per parsed record.
    models : list of [name, sql_type]
        Column names and SQL types describing the columns of ``dataout``.
    dataout : numpy.ndarray
        String array holding the retained records and the additional columns.

    Raises
    ------
    IndexError
        If no record in the file matches the log pattern.
    """

    # Log's key name -> [regular expression fragment, keep flag (1 = keep), SQL type].
    # Mind the double quotes inside some fragments.
    # (if you change a key type, you have to delete the tables that already
    #  exist in the sql database)
    # Raw strings keep "\S", "\[" and "\]" from being treated as (invalid)
    # string escapes; the pattern text itself is unchanged.
    logs  = {'BucketOwner'     :[r'(.{64}) '    , 0 ,'integer'  ], # The owner ID of the monitored bucket
             'Bucket'          :[r'(\S+) '      , 0 ,'text'     ], # The monitored bucket
             'TimeStamp'       :[r'\[(.{26})\] ', 1 ,'timestamp with time zone'], # The timestamp of the action
             'RemoteIp'        :[r'(\S+) '      , 1 ,'inet'     ], # The apparent IP of the requester
             'Requester'       :[r'(\S+) '      , 1 ,'text'     ], # The AccessKey used to perform the action
             'RequestId'       :[r'(\S+) '      , 0 ,'text'     ], # The Request ID
             'Operation'       :[r'(\S+) '      , 1 ,'text'     ], # The kind of operation
             'Key'             :[r'(\S+) '      , 1 ,'text'     ], # The object Key (if present)
             'RequestUri'      :[r'"(\S+ \S+)" ', 0 ,'text'     ], # The Request URI part of the HTTP request message
             'HttpStatus'      :[r'(\S+) '      , 1 ,'text'     ], # The numeric HTTP status code of the response
             'ErrorCode'       :[r'(\S+) '      , 1 ,'text'     ], # The S3 Error Code, or "-" if no error occurred
             'BytesSent'       :[r'(\S+) '      , 1 ,'integer'  ], # The transferred bytes
             'ObjectSize'      :[r'(\S+) '      , 1 ,'integer'  ], # The size of the object transferred
             'TotalTime'       :[r'(\S+) '      , 1 ,'integer'  ], # The number of milliseconds the request was in flight from the server's perspective
             'TurnAroundTime'  :[r'(\S+) '      , 1 ,'integer'  ], # The number of milliseconds spent processing the request
             'Referer'         :[r'(\S+) '      , 1 ,'text'     ], # The value of the HTTP Referer header, if present
             'UserAgent'       :[r'"(.*?)" '    , 1 ,'text'     ], # The value of the HTTP User Agent header
             'VersionId'       :[r'(\S+)'       , 0 ,'text'     ]} # The version ID in the request (if present)

    # Log Sample
    #data0   = 'DBC7A89150F1972C80D190B8EEC0E5C2EF12890FDEEC26925D814A22700DA0D2 je-pds [09/Nov/2022:04:26:17 +0000] 223.29.216.242 UMBMGTD7FNQ3EOOVYOFE9 0713F33F7B1B4D5E REST.HEAD.OBJECT cog%2Fv1%2Fcatalog.json "HEAD /cog/v1/catalog.json" 200 - - 1395 914 152 "" "MSP360/Wasabi Explorer/6.2.2" -'
    #regexp0 = '(.{64}) (\\S+) \\[(.{26})\\] (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) "(\\S+ \\S+)" (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) (\\S+) "(.*?)" (\\S+)'

    # Read the whole log; the context manager closes the file even on error
    with open(fn,'r') as f:
        data = f.read()

    # Skip the first 350 characters (presumably a fixed-size description
    # header -- confirm against the log producer)
    data = data[350:]

    # Replace every quoted "0" with a bare 0 (for BytesSent, TurnAroundTime)
    data = data.replace('"0"','0')

    # Extract every record with one regular expression made of all fragments
    specs   = list(logs.values())
    regexp  = "".join(spec[0] for spec in specs)
    datatmp = np.array(re.findall(regexp,data))

    # Keep only the columns whose keep flag is 1
    flags   = np.array([spec[1] for spec in specs])
    idx     = np.where(flags == 1)[0]
    datatmp = datatmp[:,idx]

    # Names and SQL types of the kept columns
    key_names = np.array(list(logs.keys()))[idx]
    key_types = np.array([spec[2] for spec in specs])[idx]

    # Define models as [name, type] pairs
    models = np.array([key_names,key_types]).transpose().tolist()

    # Convert timestamp to standard datetime format
    idx_ts  = np.where(key_names == "TimeStamp")[0][0]
    dfmt    = "%d/%b/%Y:%H:%M:%S %z"
    datetmp = [datetime.datetime.strptime(item,dfmt) for item in datatmp[:,idx_ts]]
    datatmp[:,idx_ts] = np.array(datetmp)

    # Convert "-" to zero in BytesSent
    idx_bytes = np.where(key_names == "BytesSent")[0][0]
    datatmp[:,idx_bytes] = np.array([v.replace("-","0") for v in datatmp[:,idx_bytes]])

    # Convert outliers (ex. -9223372036854), i.e. anything containing "-", to zero
    idx_tatime = np.where(key_names == "TurnAroundTime")[0][0]
    is_outlier = np.array(["-" in v for v in datatmp[:,idx_tatime]],dtype=bool)
    datatmp[is_outlier,idx_tatime] = "0"

    # Detect bucket, table names (one table per year of the record timestamp)
    # NOTE(review): table_names is not filtered by idx_ok below, so it can be
    # longer than dataout -- confirm how callers pair them up.
    bucket      = Params.BUCKET_PDS
    bucket_tmp  = bucket.replace("-","_")
    table_names = [f"{bucket_tmp}_{d.year}" for d in datetmp]

    # Parse url key to additional model's data (needs the Key column, so it
    # must run before that column is removed)
    models_add,data_add,idx_ok = gen_add_data(fn,key_names,datatmp,Params)

    # Delete Key information in models and data if pds (not stac)
    # NOTE(review): bucket is Params.BUCKET_PDS itself, so this test is
    # always true as written -- confirm the intended condition.
    if bucket in Params.BUCKET_PDS:
        idx_key = np.where(key_names == "Key")[0][0]
        datatmp = np.delete(datatmp,idx_key,1)
        models.pop(idx_key)

    # Merge additional models and data (only rows selected by idx_ok)
    models.extend(models_add)
    dataout = np.concatenate([datatmp[idx_ok,:],data_add],axis=1)

    # return
    return bucket,table_names,models,dataout

# Generate additional data by key
def gen_add_data(fn,key_names,data,Params):
    """Build the additional columns derived from each record's object key.

    Parameters
    ----------
    fn : str
        Log file name; additional columns are generated only when it
        contains ``Params.BUCKET_PDS``.
    key_names : numpy.ndarray
        Column names of ``data``; must contain ``"Key"`` when the bucket
        name is found in ``fn``.
    data : numpy.ndarray
        2-D array of parsed records, one row per record.
    Params : object
        Settings object; only its ``BUCKET_PDS`` attribute is read.

    Returns
    -------
    models : list of [name, sql_type]
        Description of the additional columns (empty for other buckets).
    data_add : numpy.ndarray
        Additional columns for the selected rows.
    idx_ok : numpy.ndarray of bool
        Mask over the rows of ``data`` telling which rows are selected.
    """

    # Other buckets: no additional columns, every row is kept
    if Params.BUCKET_PDS not in fn:
        models   = []
        data_add = np.full([len(data),len(models)],0)
        idx_ok   = np.full(len(data),True)
        return models, data_add, idx_ok

    # Additional Model
    models = [["FileType"       ,"text"],
              ["Collection"     ,"text"],
              ["ProductDate"    ,"text"],
              ["COGLevel"       ,"real"],
              ["LatLongWidth"   ,"real"],
              ["LongitudeCenter","real"],
              ["LatitudeCenter" ,"real"],
              ["Band"           ,"text"]]

    # Select key of url
    idx_key = np.where(key_names == "Key")[0][0]
    key_url = data[:,idx_key]

    # Extract only tif/tiff. The explicit bool dtype keeps the mask usable
    # even when there are no rows at all (an empty list would otherwise
    # become a float array).
    idx_ok = np.array([k.endswith(('.tif','.tiff')) for k in key_url],dtype=bool)

    # Parse the selected keys, or return an empty table when none is selected
    if idx_ok.any():
        data_add = url2values(key_url[idx_ok])
    else:
        data_add = np.full([0,len(models)],0)

    # Return
    return models, data_add, idx_ok

# Parse url 2 various names
def url2values(key_url):
    """Split object keys into the additional per-record columns.

    Parameters
    ----------
    key_url : iterable of str
        Object keys, with ``/`` possibly encoded as ``%2F``.

    Returns
    -------
    numpy.ndarray
        One row per key with the columns: first path component, collection,
        product date, COG level, followed by the four columns produced by
        ``fn2latlon`` for the file name (last path component).

    Raises
    ------
    IndexError
        If a key does not contain the expected collection / level / date parts.
    """

    # Replace %2F to /
    urls = [u.replace("%2F","/") for u in key_url]

    # Path components of every key
    parts = [u.split("/") for u in urls]

    # file type: first path component (erase ")
    ftypes = [p[0].replace('"','') for p in parts]

    # file names: last path component (erase ")
    fnames = [str(p[-1]).replace('"','') for p in parts]

    # collections: path component made of four "_"-separated pieces
    cols = [re.findall(r".*/(.*_.*_.*_.*?)/.*",u)[0] for u in urls]

    # cog levels: first path component that is a single digit (or empty)
    clevs = [re.findall(r"/(\d?)/",u)[0] for u in urls]

    # dates: whatever sits between the collection and the cog level.
    # The extracted names are escaped so that characters such as "." or "+"
    # are matched literally rather than interpreted as regex syntax.
    dates = [re.findall(f"{re.escape(col)}/(.*)/{re.escape(lev)}/",u)[0]
             for u,col,lev in zip(urls,cols,clevs)]

    # fnames to lat lon (expects one file name per row)
    data_add_latlon = fn2latlon(np.array([fnames]).transpose())

    # finalization: one row per key, text columns first
    text_cols = np.array([ftypes,cols,dates,clevs]).transpose()
    data_add  = np.concatenate([text_cols,data_add_latlon],axis=1)

    # Return
    return data_add

# Parse fname 2 latlon
def fn2latlon(fnames):
    """Derive extent width, centre coordinates and band from file names.

    Each file name is split on ``-``. Every token except the last is read as
    one letter followed by a number; the letters ``M``, ``S`` and ``W``
    (Minus, South, West) make the number negative. The last token holds the
    band name followed by ``.`` and the extension(s).

    Parameters
    ----------
    fnames : array-like, shape (n, 1)
        One file name per row, all with the same number of ``-`` tokens.

    Returns
    -------
    numpy.ndarray of str, shape (n, 4)
        Columns: width (3rd value minus 1st value), mean of the values at
        even positions (used as longitude centre), mean of the values at odd
        positions (used as latitude centre), band name.
    """

    # Nothing to parse: return an empty table with the usual four columns
    if len(fnames) == 0:
        return np.empty((0,4),dtype=str)

    # Get file name and parse again
    tokens = np.array([row[0].split("-") for row in fnames])

    # Get lat, lon value (at most the first four tokens are coordinates)
    latlon = np.zeros(tokens.shape)[:,0:4]
    for i,row in enumerate(tokens):
        for j,token in enumerate(row[:-1]):
            sign = -1 if token[0] in "MSW" else 1 # Minus, South, West
            latlon[i,j] = sign*float(token[1:])

    # Calculate latlon center
    llw  = (latlon[:,2]-latlon[:,0])[:,np.newaxis]
    latc = np.nanmean(latlon[:,1::2],axis=1,keepdims=True)
    lonc = np.nanmean(latlon[:,0::2],axis=1,keepdims=True)

    # Get band: text before the first "." of the last token. Splitting per
    # item keeps this working when names carry different numbers of dots.
    band = np.array([[str(row[-1]).split(".")[0] for row in tokens]]).transpose()

    # Final merge: convert the numbers to text explicitly instead of relying
    # on implicit number/string promotion during concatenation
    numbers  = np.concatenate([llw,lonc,latc],axis=1).astype(str)
    data_add = np.concatenate([numbers,band],axis=1)

    # Return
    return data_add

