[英]Read real-time data inserted as chunks
我有一个Python脚本,它每天通过Task Scheduler运行,读取不断增长的日志文件(文本文件)并将数据插入PostgreSQL数据库。 每天都会生成一个新的日志文件,每个日志文件的大小约为1GB。
我搜索了如何对PostgreSQL进行调优(tuning)以应对繁重的I/O,以下是我修改的配置:
shared_buffers: 8GB
work_mem: 100 MB
maintenance_work_mem: 512 MB
checkpoint_segments: 100
checkpoint_timeout: 1hr
synchronous_commit = off
full_page_writes = off
fsync = off
编写脚本以逐行读取日志文件并将其插入数据库:
import psycopg2 as psycopg
# Open the PostgreSQL connection (cx) and cursor (cu) used by the rest of the
# script.
try:
    connectStr = "dbname='postgis20' user='postgres' password='' host='localhost'"
    cx = psycopg.connect(connectStr)
    cu = cx.cursor()
except psycopg.Error:
    # Without a connection, cx and cu stay undefined and every later
    # cu.execute() would fail with a confusing NameError. Log the real cause
    # (with traceback) and stop here instead of carrying on.
    logging.exception("could not connect to the database")
    raise
else:
    logging.info("connected to DB")
import time
file = open('textfile.log', 'r')
while 1:
where = file.tell()
line = file.readline()
if not line:
time.sleep(1)
file.seek(where)
else:
print line, # already has newline
dodecode(line)
# Parse one comma-separated log line, pick a decoder based on the message
# number, and pass the decoded message to Insert().
# The indentation was lost and parts of the body are elided ("....") in this
# paste, so only the visible statements are described.
def dodecode(fields):
# NOTE(review): cx is declared global but is not used in the visible body.
global cx
# NOTE(review): strftime, gmtime, timegm and os are not used in the visible
# part of this function.
from time import strftime, gmtime
from calendar import timegm
import os
msg = fields.split(',')
# NOTE(review): eval() runs whatever expression the third field contains;
# since the field comes straight from the log file this is unsafe. If it is
# always a number, int(msg[2]) would be the safer equivalent - confirm.
part = eval(msg[2])
# NOTE(review): msg[3:6] is a list slice and int() does not accept a list, so
# this line raises TypeError as written. Presumably a single field (msg[3]) or
# a substring was intended - confirm against the real script.
msgnum = int(msg[3:6])
print "message#:", msgnum
print fields
if (part==1):
if msgnum==1:
# NOTE(review): msg_1, msg_2 and bv are not defined anywhere in the visible
# snippet; presumably per-message decoders and the payload to decode.
msg1 = msg_1.decode(bv)
#print "message1 :",msg1
# NOTE(review): no local named `time` is assigned in the visible body, so this
# passes the `time` module imported at file level, not a timestamp - confirm.
Insert(msgnum,time,msg1)
elif msgnum==2:
msg2 = msg_2.decode(bv)
#print "message2 :",msg2
Insert(msgnum,time,msg2)
elif msgnum==3:
# The remaining branches were elided in the original post.
....
....
....
# Write one decoded message to the database through the module-level cursor
# `cu`. For message numbers 1-3 with msg['type'] 0 or 1, the visible code issues
# three statements: an INSERT into table1 guarded by WHERE NOT EXISTS, an
# INSERT into table2 guarded by WHERE NOT EXISTS on field1, and an UPDATE of
# table2 restricted to rows whose time_stamp is older than `time`.
# The indentation was lost and parts of the body are elided ("....") in this
# paste, so only the visible statements are described.
#
# NOTE(review): every statement is built by string concatenation from message
# fields; psycopg2 parameter binding (placeholders plus an argument tuple or
# dict) would avoid quoting bugs and SQL injection.
# NOTE(review): no commit is visible in the shown portion, and the except
# clause matching `try:` is elided.
def Insert(msgnum,time,msg):
# NOTE(review): cx is declared global but the visible statements only use cu.
global cx
try:
if msgnum in [1,2,3]:
if msg['type']==0:
# NOTE(review): `text` is not defined in the visible function. The quoting on
# the cu.execute lines for table1 and the table2 UPDATE looks mangled (likely
# during posting); as written they do not look like valid Python - confirm
# against the real script.
cu.execute("INSERT INTO table1 ( messageid, timestamp, userid, position, text ) SELECT "+str(msgnum)+", '"+time+"', "+str(msg['UserID'])+", ST_GeomFromText('POINT("+str(float(msg['longitude']), '"+text+"')+" "+str(float(msg['latitude']))+")']))+" WHERE NOT EXISTS (SELECT * FROM table1 WHERE timestamp='"+time+"' AND text='"+text+"';")
cu.execute("INSERT INTO table2 ( field1,field2,field3, time_stamp, pos,) SELECT "+str(msg['UserID'])+","+str(int(msg['UserName']))+","+str(int(msg['UserIO']))+", '"+time+"', ST_GeomFromText('POINT("+str(float(msg['longitude']))+" "+str(float(msg['latitude']))+")')," WHERE NOT EXISTS (SELECT * FROM table2 WHERE field1="+str(msg['UserID'])+");")
cu.execute("Update table2 SET field3='"+str(int(msg['UserIO']))+"',time_stamp='"+str(time)+"',pos=ST_GeomFromText('POINT("+str(float(msg['longitude']))+" "+str(float(msg['latitude']))+")'),"' WHERE field1='"+str(msg['UserID'])+"' AND time_stamp < '"+str(time)+"';")
elif msg['type']==1:
# NOTE(review): the three statements below are the same as in the type 0
# branch above.
cu.execute("INSERT INTO table1 ( messageid, timestamp, userid, position, text ) SELECT "+str(msgnum)+", '"+time+"', "+str(msg['UserID'])+", ST_GeomFromText('POINT("+str(float(msg['longitude']), '"+text+"')+" "+str(float(msg['latitude']))+")']))+" WHERE NOT EXISTS (SELECT * FROM table1 WHERE timestamp='"+time+"' AND text='"+text+"';")
cu.execute("INSERT INTO table2 ( field1,field2,field3, time_stamp, pos,) SELECT "+str(msg['UserID'])+","+str(int(msg['UserName']))+","+str(int(msg['UserIO']))+", '"+time+"', ST_GeomFromText('POINT("+str(float(msg['longitude']))+" "+str(float(msg['latitude']))+")')," WHERE NOT EXISTS (SELECT * FROM table2 WHERE field1="+str(msg['UserID'])+");")
cu.execute("Update table2 SET field3='"+str(int(msg['UserIO']))+"',time_stamp='"+str(time)+"',pos=ST_GeomFromText('POINT("+str(float(msg['longitude']))+" "+str(float(msg['latitude']))+")'),"' WHERE field1='"+str(msg['UserID'])+"' AND time_stamp < '"+str(time)+"';")
elif msg['type']==2:
# The remaining branches were elided in the original post.
....
....
....
问题是,脚本运行6个小时之后,只插入了相当于5分钟的日志数据! 我怀疑数据是以块而不是逐行的形式写入日志文件的,但是我真的不知道如何解决这个问题,使数据库中的数据更接近实时。
您是否考虑过使用psycopg2的executemany? 下面是取自这个答案的一个简单示例:
# Rows to insert, one dict per row; the keys match the %(name)s placeholders
# in the statement below.
namedict = (
    {"first_name": "Joshua", "last_name": "Drake"},
    {"first_name": "Steven", "last_name": "Foo"},
    {"first_name": "David", "last_name": "Bar"},
)
cur = conn.cursor()
# executemany runs the statement against every mapping in the sequence,
# binding the values by name.
cur.executemany(
    """INSERT INTO bar(first_name,last_name) VALUES (%(first_name)s, %(last_name)s)""",
    namedict
)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.