Python 获取 Apache 日志并推送数据到 Elasticsearch

狒狒 2023-5-19 python 261 次
		import os
	
		import re
	
		import collections
	
		from elasticsearch import Elasticsearch
	
		# Sub-fields of the Facebook in-app-browser token, in the order they appear.
		_FB_KEYS = ('FBAN', 'FBDV', 'FBMD', 'FBSN', 'FBSV', 'FBSS', 'FBID', 'FBLC', 'FBOP')

		# Patterns are compiled once at import time instead of on every log line.
		_IP_RE = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})')
		# Any upper-case HTTP method, not only GET.
		_URL_RE = re.compile(r'"[A-Z]+ (.+?) ')
		_BRACKET_RE = re.compile(r'\[(.+?)\]')
		# Any protocol version (HTTP/1.0, HTTP/1.1, HTTP/2.0, ...), with the dot escaped.
		_STATUS_RE = re.compile(r'HTTP/\d(?:\.\d)?" (\d{3}) ')
		_TAIL_RE = re.compile(r'" (.+?)"$')
		_AGENT_RE = re.compile(r'" "(.+?)$')
		_FB_RE = re.compile(
		    r'\[' + ';'.join(key + '/(.*?)' for key in _FB_KEYS) + r'\]'
		)


		def _first_group(pattern, text):
		    """Return group 1 of the first match of *pattern* in *text*, or "" if none."""
		    found = pattern.search(text)
		    return found.group(1) if found is not None else ""


		def get_all_info(log):
		    """Extract the fields of one access-log line into a dict.

		    Args:
		        log: One line of the access log as a string (a trailing newline
		            is tolerated).

		    Returns:
		        A dict with the keys ``datetime``, ``url``, ``ip``, ``status``,
		        ``browser``, ``client`` and ``fb``. Every field that cannot be
		        found is the empty string; ``fb`` is a dict of the nine ``FB*``
		        sub-fields when the line contains a ``[FBAN/...;FBOP/...]`` token
		        and an empty list otherwise.

		    Side effects:
		        Prints the extracted IP address to stdout.
		    """
		    ip = _first_group(_IP_RE, log)
		    print('IP地址：', ip)

		    url = _first_group(_URL_RE, log)

		    # The first [...] group on the line is used for both "datetime" and
		    # "client", exactly as before.
		    # NOTE(review): "client" was presumably meant to be a different
		    # bracketed token (e.g. the one in the user agent) - confirm the intent
		    # before changing it, since it alters the indexed documents.
		    first_bracket = _first_group(_BRACKET_RE, log)

		    status = _first_group(_STATUS_RE, log)

		    # Everything between the first `" ` and the closing quote at the end of
		    # the line; the user agent is whatever follows the first `" "` in there.
		    tail = _TAIL_RE.search(log)
		    browser = _first_group(_AGENT_RE, tail.group(1)) if tail is not None else ""

		    fb_match = _FB_RE.search(log)
		    if fb_match is not None:
		        fb_data = dict(zip(_FB_KEYS, fb_match.groups()))
		    else:
		        # Kept as an empty list (not a dict) so existing documents and
		        # consumers see the same shape as before.
		        fb_data = []

		    return {
		        "datetime": first_bracket,
		        "url": url,
		        "ip": ip,
		        'status': status,
		        "browser": browser,
		        "client": first_bracket,
		        "fb": fb_data,
		    }
	
		# Follow the access_log file and push every newly appended line to
		# Elasticsearch.
		#
		# The previous loop re-read the last 800 bytes on every mtime change. That
		# failed on files shorter than 800 bytes, raised IndexError when the tail
		# held a single line, indexed an empty document when no line was found,
		# skipped lines written between two polls, and spun at 100% CPU. This loop
		# starts at the current end of the file and reads forward, so each complete
		# line written afterwards is indexed exactly once.
		import time  # local to this script section; used only for the poll delay

		filename = '/var/log/httpd/access_log'

		es = Elasticsearch(hosts=[{'host':'137.184.90.17','port':9200,'scheme':'http'}])

		# Seconds to wait before polling again once the end of the file is reached.
		POLL_INTERVAL = 1.0

		# errors='replace' keeps one undecodable byte from stopping the watcher.
		with open(filename, errors='replace') as f:
		    # Skip what is already in the file; only lines appended from now on
		    # are indexed.
		    f.seek(0, 2)

		    # Holds the start of a line whose trailing newline has not arrived yet.
		    pending = ''

		    # NOTE(review): the file is opened once and never reopened, so a log
		    # rotation that replaces the file is not followed - restart the script
		    # after rotation, or add reopen logic if that matters here.
		    while True:
		        chunk = f.readline()

		        if not chunk:
		            # At end of file: wait instead of busy-looping.
		            time.sleep(POLL_INTERVAL)
		            continue

		        pending += chunk

		        if not pending.endswith('\n'):
		            # The writer has not finished this line yet; keep collecting.
		            continue

		        line, pending = pending, ''

		        if not line.strip():
		            # Never index an empty document for a blank line.
		            continue

		        bodyData = get_all_info(line)

		        print(bodyData)

		        es.index(index="216service_access_log_url", body=bodyData)
版权属于：BLOG DEWEBSTUDIO 本文作者：狒狒
原文地址： http://blog.dewebstudio.com/?post=126
继续浏览： python elasticsearch apache日志
上一篇：一次python3.10安装的过程
下一篇：openssl升级到3.1版本
python获取apache日志并推送数据到elasticsearch

发表评论

搜索

分类

最新文章

最新评论