[英]I would like to split a huge file into many smaller files, with the header present in every split file, using Python.
"SURNAME","GIVENNAME","MIDDLENAME","UPIN","NAME","CODE"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770"
"ADU", "GOU","RAN", "3cxd", "GOU RAN", "0770
Let's suppose this is the format of the huge file. I would like to split it into a number of files of a specified size, and each file needs to contain the header ("SURNAME","GIVENNAME","MIDDLENAME","UPIN","NAME","CODE"). Thanks in advance. 让我们假设这是大文件的格式,我想把它分割成多个指定大小的文件,并且每个文件中都需要包含标题行(“SURNAME”,“GIVENNAME”,“MIDDLENAME”,“UPIN”,“NAME”,“CODE”)。提前致谢。
import os
import sys
def getfilesize(filename):
    """Return the size of *filename* in bytes.

    The file is opened in binary mode and the size is read from the file
    position after seeking to the end.  The size is also printed as a
    diagnostic line.

    Args:
        filename: Path of the file to measure.

    Returns:
        The file size in bytes as an int.
    """
    with open(filename, "rb") as fr:
        fr.seek(0, os.SEEK_END)  # move to end of the file
        size = fr.tell()
    print("getfilesize: size: %s" % size)
    return size
def _start_part(root, ext, index, header):
    """Open part number *index* for writing and put the header line in it."""
    part = open("%s_%d%s" % (root, index, ext), "wb")
    part.write(header)
    return part


def splitfile(filename, splitsize):
    """Split a line-oriented file into numbered parts that all carry the header.

    The first line of *filename* is treated as the header and is written at
    the top of every part.  The remaining lines are copied whole, in order,
    so no row is ever cut in half.  A new part is started whenever adding
    the next line would push the current part past *splitsize* bytes.

    Parts are written next to the input as ``<root>_<n><ext>`` with ``n``
    starting at 1 (``data.csv`` -> ``data_1.csv``, ``data_2.csv``, ...).
    Existing files with those names are overwritten.  If the input has no
    data lines, a single part holding only the header is still written.

    Args:
        filename: Path of the file to split.
        splitsize: Maximum size of each part in bytes, header included.
            A single line longer than this budget is still written whole,
            so a part holding such a line can exceed the limit.

    Returns:
        None.  If *filename* is not an existing file a message is printed
        and nothing is written.

    Raises:
        ValueError: If *splitsize* is not a positive number.

    Note:
        Rows are delimited by newline bytes; a quoted field that contains
        an embedded newline is treated as two separate lines.
    """
    if not os.path.isfile(filename):
        print("No such file as: \"%s\"" % filename)
        return

    splitsize = int(splitsize)
    if splitsize <= 0:
        raise ValueError("splitsize must be a positive number of bytes")

    # splitext copes with dots in directory names and with missing extensions.
    root, ext = os.path.splitext(filename)
    counter = 0
    written = 0
    fw = None
    with open(filename, "rb") as fr:
        header = fr.readline()
        try:
            for line in fr:
                # Only rotate when the current part already holds a data
                # line; otherwise an oversized line would create empty parts.
                part_has_rows = written > len(header)
                over_budget = written + len(line) > splitsize
                if fw is None or (part_has_rows and over_budget):
                    if fw is not None:
                        fw.close()
                    counter += 1
                    fw = _start_part(root, ext, counter, header)
                    written = len(header)
                fw.write(line)
                written += len(line)
            if fw is None:
                # No data lines at all: still emit one header-only part.
                counter += 1
                fw = _start_part(root, ext, counter, header)
        finally:
            if fw is not None:
                fw.close()
    print("splitfile: No of splits written: %d" % counter)
# Command-line entry point: filesplit.py <filename> <split size in kB>
if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Filename or splitsize not provided: Usage: filesplit.py filename splitsizeinkb ")
    else:
        # The size is given on the command line in kB; splitfile expects
        # bytes, so scale it up by 1000.
        filesize = int(sys.argv[2]) * 1000
        filename = sys.argv[1]
        splitfile(filename, filesize)
This works fine, but I couldn't get the header into each split file. I'm sorry, I'm new to Stack Overflow. 这可以正常工作,但无法在每个分割文件中包含标题行。抱歉,我是 Stack Overflow 的新手。
I have used pandas to split large files into smaller ones: 我用 pandas 将大文件分割成较小的文件:
import pandas as pd
# Split a large CSV into chunk_0.csv, chunk_1.csv, ... using pandas.
infile = 'path/to/your/file.csv'  # path to your file

# chunksize is the number of data rows per output file; the header row is
# not counted and is written again at the top of every chunk.
for n, chunk in enumerate(pd.read_csv(infile, sep=',', chunksize=1000000)):
    oPath = 'chunk_' + str(n) + '.csv'
    # Keep the comma delimiter so each chunk stays a valid CSV like the input.
    chunk.to_csv(oPath, sep=',', index=False, header=True)
chunksize
indicates how many data rows (not counting the header) you want in each output file. chunksize
指示每个输出文件中需要多少行数据(不含标题行)。
This should do it: 这样应该就可以了:
import os
# Split a text file into parts of at most `maxlines` data lines each,
# repeating the first line (the header) at the top of every part.
maxlines = 1000  # how many data lines each new file should have
infilepath = 'path/to/file'

dirpath = os.path.dirname(infilepath)
# splitext also copes with names that have no extension (ext is then '').
fname, ext = os.path.splitext(os.path.basename(infilepath))


def _part_path(index):
    """Return the path of output part *index*, e.g. ``file0.csv``."""
    return os.path.join(dirpath, "{}{}{}".format(fname, index, ext))


with open(infilepath) as infile:
    header = infile.readline()
    # Part 0 is always created, so a header-only input still yields one
    # output file that contains the header.
    outfile = open(_part_path(0), 'w')
    try:
        outfile.write(header)
        for i, line in enumerate(infile):
            # Start a new part every `maxlines` data lines (but not at i == 0,
            # because part 0 is already open).
            if i and i % maxlines == 0:
                outfile.close()
                outfile = open(_part_path(i // maxlines), 'w')
                outfile.write(header)
            outfile.write(line)
    finally:
        # Close the last part even if reading or writing failed midway.
        outfile.close()
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.