返回官网

python获取apache日志并推送数据到elasticsearch

狒狒 2023-5-19 python 152 次
import collections
import os
import re
import time

from elasticsearch import Elasticsearch

def get_all_info(log):
    """Parse one Apache access-log line into a dict ready for Elasticsearch.

    Every field falls back to an empty string (or empty dict for "fb") when
    its pattern is absent, so a malformed line still yields an indexable
    document.

    :param log: one raw access-log line.
    :return: dict with keys datetime, url, ip, status, browser, client, fb.
    """
    # Client IP: first dotted-quad found in the line.
    m = re.search(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', log)
    IP = m.group(1) if m is not None else ""
    print('IP地址:', IP)

    # Source host: everything before the first " - " separator.
    sm = re.search(r'^.+? - ', log)
    source = sm.group()[:-2] if sm is not None else ""

    # Referer URL, quotes stripped.
    # BUG FIX: the original pattern r'\"http|https?://.+?/\"' alternated
    # between the literal '"http' and 'https?://.+?/"' because | splits the
    # whole pattern; capture the URL in a group instead.
    dm = re.search(r'\"(https?://.+?/)\"', log)
    domain = dm.group(1) if dm is not None else ""

    # Requested path from the GET request line.
    um = re.search(r'\"GET (.+?) ', log)
    url = um.group(1) if um is not None else ""

    # Timestamp: content of the first pair of square brackets.
    tm = re.search(r'\[.+?\]', log)
    datetime = tm.group()[1:-1] if tm is not None else ""

    # HTTP status code following the protocol token.
    stm = re.search(r'HTTP/1.1\" (\d{3}) ', log)
    status = stm.group(1) if stm is not None else ""

    # User agent: the quoted field at the end of the line.  The first match
    # may start at an earlier quote-space (after the status code), so a
    # second pass strips everything up to the last '" "' separator.
    bm = re.search(r'" (.+?)\"$', log)
    if bm is not None:
        browser = bm.group(1)
        browser_m = re.search(r'" "(.+?)$', browser)
        browser = browser_m.group(1) if browser_m is not None else ''
    else:
        browser = ""

    # Client field: same bracketed span as the timestamp in standard
    # combined-log format.
    cm = re.search(r'\[(.+?)\]', log)
    client = cm.group(1) if cm is not None else ""

    # Facebook in-app browser metadata, e.g. "[FBAN/FB4A;FBDV/...;FBOP/5]".
    # FIX: raw string — the original non-raw literal relied on invalid
    # escape sequences like '\/' and '\['.
    pattern = r"\[FBAN\/(.*?);FBDV\/(.*?);FBMD\/(.*?);FBSN\/(.*?);FBSV\/(.*?);FBSS\/(.*?);FBID\/(.*?);FBLC\/(.*?);FBOP\/(.*?)\]"
    match = re.search(pattern, log)
    # FIX: default to a dict (was []) so the "fb" field always has one type;
    # mixed list/object values break the Elasticsearch mapping.
    FBDATA = {}
    if match:
        keys = ('FBAN', 'FBDV', 'FBMD', 'FBSN', 'FBSV', 'FBSS', 'FBID', 'FBLC', 'FBOP')
        FBDATA = dict(zip(keys, match.groups()))

    body = {"datetime": datetime, "url": url, "ip": IP, 'status': status,
            "browser": browser, "client": client, "fb": FBDATA}
    return body

# Tail /var/log/httpd/access_log: whenever the file's mtime changes, parse
# the newest entry with get_all_info() and index it into Elasticsearch.
filename = '/var/log/httpd/access_log'
file_mtime = None

es = Elasticsearch(hosts=[{'host': '137.184.90.17', 'port': 9200, 'scheme': 'http'}])

with open(filename) as f:
    while True:
        new_mtime = os.stat(filename).st_mtime
        if file_mtime != new_mtime:
            # Jump to the end, then back up a fixed window to re-read the
            # tail of the file.
            f.seek(0, 2)
            last_pos = f.tell()
            print(last_pos)
            # BUG FIX: seeking to a negative position raises ValueError on
            # files shorter than the 800-byte window; clamp at 0.
            f.seek(max(last_pos - 800, 0), 0)
            # First read is likely a partial line (we seeked mid-line); it is
            # only used to decide whether more complete lines follow.
            lines = f.readline()
            print(lines)
            line = ''
            if len(lines) > 1:
                print(len(lines))
                # BUG FIX: guard against an empty remainder — the original
                # f.readlines()[-1] raised IndexError when the first
                # readline() had already consumed the whole window.
                rest = f.readlines()
                if rest:
                    line = rest[-1]
            else:
                f.seek(0)
                print('<=1')
            bodyData = get_all_info(line)
            print(bodyData)
            es.index(index="216service_access_log_url", body=bodyData)
            file_mtime = new_mtime
        # BUG FIX: without a pause this polling loop busy-spins a full CPU
        # core between log updates.
        time.sleep(1)

发表评论

Copyright © 2016 DEWEBSTUDIO