简体   繁体   English

使用 python 将复杂的 txt 转换为 csv

[英]Convert complex txt to csv with python

I want to convert text to csv. Input file contains 10000K lines.我想将文本转换为 csv。输入文件包含 10000K 行。 Sample of input file is as below:-输入文件的样本如下:-

Item=a
Price=10
colour=pink


Item=b
Price=20
colour=blue Pattern=checks

My output should look like this我的 output 应该是这样的

Item Price Colour Pattern

a    10    pink
b    20    blue   checks

My code is as below我的代码如下

import csv
import glob
import os

def dat_to_csv(filename, table_name):
    with open(filename, 'r',errors='ignore') as reader:
        list_of_columns = []
        table_values = []
        master_table = []
        counter = 0
        column_name1 = []
        value1 = []
        column_name2 = []
        value2 = []
        column_name3 = []
        value3 = []
        column_name4 = []
        value4 = []
        lines_after_23 = reader.readlines()[23:]
        for line in lines_after_23:
        #stripped_line = line.strip()
            if line.startswith("#"):
                continue
            if line.startswith("Associate"):
                continue
            if line == "\n":
                if (table_values):
                    master_table.append([])
                    master_table[counter] = table_values.copy()
                    counter = counter + 1
                    length = len(table_values)
                    for element in range(length):
                        table_values[element] = []
                continue 
            if line == "\n":
                continue

            if line == "\n":
                master_table.append([])
                master_table[counter] = table_values.copy()
                counter = counter + 1
                length = len(table_values)
                for element in range(length):
                    table_values[element] = []
                break

            extra_stripped_line = ' '.join(line.split())
             

            data = extra_stripped_line.split("=",1)
            column_name = data[0].strip()
            if '=' in data[1].strip():
                data1 = data[1].split(" ",1)
                value = data1[0].strip()
                data2 = data1[1].split("=",1)
                column_name1 = data2[0].strip()
                
                if '=' in data2[1].strip():
                    column_name2 = []
                    value2 = []
                    data3 = data2[1].split(" ",1)
                    value1 = data3[0].strip()
                    data4 = data3[1].split("=",1)
                    column_name2 = data4[0].strip()
                    
                    if '=' in data4[1].strip():
                        data5 = data4[1].split(" ",1)
                        value2 = data5[0].strip()
                        data6 = data5[1].split("=",1)
                        column_name3 = data6[0].strip()
                        
                        if '=' in data6[1].strip():
                            data7 = data6[1].split(" ",1)
                            value3 = data7[0].strip()
                            data8 = data7[1].split("=",1)
                            column_name4 = data8[0].strip()
                            
                            if '=' in data8[1].strip():
                                data9 = data8[1].split(" ",1)
                                value3 = data9[0].strip()
                                data10 = data9[1].split("=",1)
                                column_name4 = data10[0].strip()
                                value4 = data10[1].strip()
                            else:
                                value4 = data8[1].strip()
                        else:
                            value3 = data6[1].strip()
                        
                    else:
                        value2 = data4[1].strip()
                    
                else:
                    value1 = data2[1].strip()
            else:
                value = data[1].strip()  
                                
            if column_name not in list_of_columns:
                list_of_columns.append(column_name)
                table_values.append([])
                
            if column_name1 is not []:
                if column_name1 not in list_of_columns:
                    list_of_columns.append(column_name1)
                    table_values.append([])
                
            if column_name2 is not []:
                if column_name2 not in list_of_columns:
                    list_of_columns.append(column_name2)
                    table_values.append([]) 
                
            if column_name3 is not []:
                if column_name3 not in list_of_columns:
                    list_of_columns.append(column_name3)
                    table_values.append([]) 
                
            if column_name4 is not []:
                if column_name4 not in list_of_columns:
                    list_of_columns.append(column_name4)
                    table_values.append([])       

            index = list_of_columns.index(column_name)
            if column_name1 is not []:
                index1 = list_of_columns.index(column_name1)
            if column_name2 is not []:
                index2 = list_of_columns.index(column_name2)
            if column_name3 is not []:
                index3 = list_of_columns.index(column_name3)
            if column_name4 is not []:
                index4 = list_of_columns.index(column_name4)

            #table_values[index].append(value)
            table_values[index] = value
            if value1 is not []:
                table_values[index1] = value1
            if value2 is not []:
                table_values[index2] = value2
            if value3 is not []:
                table_values[index3] = value3
            if value4 is not []:
                table_values[index4] = value4
             
        #with open("output\\{}.csv".format(table_name), 'w', newline='') as csvfile:
        with open("yourpath\\{}.csv".format(table_name), 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_ALL)
            writer.writerow(list_of_columns)

            #t_table_values = zip(*table_values)
            max_elements = len(master_table)
            master_table_transp = []
            cntr = 0
            for cntr in range(max_elements):
                master_table_transp.append([])
            num_objects = len(master_table)

            for cntr_obj in range(num_objects):
                for cntr_row in range(max_elements):
                    if (cntr_row<len(master_table[cntr_obj])):
                        master_table_transp[cntr_row].append(master_table[cntr_obj][cntr_row])
                    else:
                        master_table_transp[cntr_row].append([])
                        
                   
            
            t_table_values = zip(*master_table_transp)
                       
            for values in t_table_values:
                writer.writerow(values)
                
            if value1 is not []:
                for value1s in t_table_values:
                    writer.writerow(value1s)
                
            if value2 is not []:
                    for value2s in t_table_values:
                        writer.writerow(value2s)
                
            if value3 is not []:
                for value3s in t_table_values:
                    writer.writerow(value3s)
                
            if value4 is not []:
                for value4s in t_table_values:
                    writer.writerow(value4s) 
                
                    
if __name__ == '__main__':
    path = "your path" 
    for filename in glob.glob((os.path.join(path, '*.dat'))):
        name_only = os.path.basename(filename).replace(".dat", "")
        dat_to_csv(filename, name_only)

I am getting required output but there are few issues:-我需要 output 但有几个问题:-

  1. a blank column at 2nd position is being created正在创建 2nd position 的空白列
  2. last item "item=n" is not showing in my csv. rest all items showing最后一项“item=n”未显示在我的 csv 中。rest 显示所有项目

Unless I've misunderstood something then it's as simple as this:除非我误解了什么,否则它就像这样简单:

from pandas import DataFrame
from numpy import nan

master = [dict()]

with open('foo.txt') as foo:
    for line in foo:
        if (line := line.strip()):
            for token in line.split():
                k, v = token.split('=')
                master[-1][k] = v
        elif master[-1]:
            master.append(dict())

if not master[-1]:
    del master[-1]

if master:
    df = DataFrame(master).replace(nan, '', regex=True)
    df.to_csv('foo.csv', index=False)

Output (of the csv file): Output(csv 文件):

Item,Price,colour,Pattern
a,10,pink,
b,20,blue,checks

With some assumptions this works.有了一些假设,这是可行的。 I added some test cases.我添加了一些测试用例。 This does require all the records to fit in memory, but if you know all the possible column names in advance you can set columns accordingly and write the lines as generated instead of all at the end.这确实需要所有记录都适合 memory,但如果您事先知道所有可能的列名,则可以相应地设置columns并将行写为生成而不是最后。 Even with 10000K (10M) records unless the records are really large that will easily fit in a modern system memory.即使有 10000K (10M) 条记录,除非记录真的很大,很容易适应现代系统 memory。

input.csv输入.csv

Item=a
Price=10

Item=b
Price=20
colour=blue Pattern=checks

Item=c
Price=5

Item=d Price=25 colour=blue

Item=e colour===FANCY== Price=1/2=$1

test.py测试.py

from collections import defaultdict
import csv

columns = {}
lines = []

with open('input.txt') as fin:
    for line in fin:
        if not line.strip(): # write record on blank line
            needs_flush = False
            lines.append(columns)
            # blank all the columns to start next record.
            columns = {k:'' for k in columns}
            continue

        # assume multiple items on a line are separated by a single space
        items = line.strip().split(' ')
        
        # assume column name is before first = sign in each item
        for column,value in [item.split('=',1) for item in items]:
            needs_flush = True
            columns[column] = value

    # write record on EOF if hasn't been flushed
    if needs_flush:
        lines.append(columns)

# dump records to CSV
with open('output.csv','w',newline='') as fout:
    writer = csv.DictWriter(fout, fieldnames=columns)
    writer.writeheader()
    writer.writerows(lines)

output.csv: output.csv:

Item,Price,colour,Pattern
a,10,,
b,20,blue,checks
c,5,,
d,25,blue,
e,1/2=$1,==FANCY==,

I like to make little state machines for these kinds of problems, since as much as we'd all like to believe the sample data matches the real world, there are probably some gotchas, and you want a solution that's flexible.我喜欢为这类问题制作小型 state 机器,因为尽管我们都愿意相信样本数据与现实世界相匹配,但可能存在一些陷阱,并且您需要一个灵活的解决方案。

For me, that flexibility means as I'm looping over the input lines:对我来说,这种灵活性意味着当我在输入行上循环时:

  • avoid many nested structures避免许多嵌套结构
  • do all processing in one place;在一个地方进行所有处理; at the end of the loop, once I know this line is worth processing在循环结束时,一旦我知道这一行值得处理
  • for processing, try to break down the steps and still keep a flat/linear flow...对于处理,尝试分解步骤并仍然保持平坦/线性流程......
    • how do I go from a line to fields?我如何将 go 从一行到字段?
    • how do I go from a field to a key/value pair?如何将 go 从字段转换为键/值对?
    • how do I use the key-value pair?如何使用键值对?

And as you try it out, try starting out with smaller samples of your input and build up to the whole, big thing.当你尝试它时,试着从你的输入的较小样本开始,然后逐步建立一个完整的、大的事物。

#!/usr/bin/env python3
import csv

field_names = {}  # use dict as ordered set to collect all field names as data is parsed
records = []
record = None
with open('input.txt') as f:
    for line in f:
        line = line.strip()

        if line.startswith('Item'):
            record = {}

        if record is None:
            continue

        if line == '':
            records.append(record)
            record = None
            continue

        # Finally, line must be data in a record, parse it
        fields = line.split(' ')
        kvps = [field.split('=', 1) for field in fields]  # 1 in split('=', 1) is for the `===FANCY==` example @MarkTolonen threw at us
        kvp_dict = dict(kvps)

        record.update(kvp_dict)
        field_names.update(kvp_dict)                      # pass in keys & vals (it's simpler) even if we only need the keys



# Deal with "straggling record" (if your input ends with a line of data (and not an empty line))
if record is not None:
    records.append(record)


out_f = open('output.csv', 'w', newline='')
writer = csv.DictWriter(out_f, fieldnames=field_names)
writer.writeheader()
writer.writerows(records)

Here's my output.csv:这是我的 output.csv:

| Item | Price  | colour    | Pattern |
|------|--------|-----------|---------|
| a    | 10     | pink      |         |
| b    | 20     | blue      | checks  |
| aa   | 10     |           |         |
| bb   | 20     | blue      | checks  |
| cc   | 5      |           |         |
| dd   | 25     | blue      |         |
| ee   | 1/2=$1 | ==FANCY== |         |

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM