[英]Consolidate multiple tsv files from a folder into one using python
I have multiple tsv files in a folder like 2018Q1.tsv, 2018Q2.tsv, 2018Q3.tsv and so on.我在 2018Q1.tsv、2018Q2.tsv、2018Q3.tsv 等文件夹中有多个 tsv 文件。 Each tuple inside the tsv file is separated by '\t' and each row is separated by '\n'.
tsv 文件中的每个元组由“\t”分隔,每一行由“\n”分隔。
I want to consolidate all the tsv files inside a folder into one single file, keeping all the rows and adding the source filename as a new column in the consolidated file, using Python.我想将文件夹内的所有 tsv 文件合并到一个文件中,所有行包括文件名作为 python 中新合并文件中的新列。
import os
import pandas as pd

# Directory that holds the quarterly .tsv files.
cwd = os.path.abspath(r'path/to/directory')

# os.listdir returns bare file names (no directory prefix), so every
# name must be joined back onto cwd before reading -- passing the bare
# name to read_csv was the cause of the FileNotFoundError.  Also keep
# only .tsv files so stray entries in the folder don't break the parse.
file_list = [f for f in os.listdir(cwd) if f.endswith('.tsv')]

columns = ['CIK_number', 'Companyname', 'FilingType', 'Filingdate', 'filingtext', 'filingurl']

# Build one frame per file, keyed by the file's base name (e.g. '2018Q1').
# concat turns the dict keys into an index level named 'Date', which
# reset_index(0) promotes to a regular column.  No outer loop is needed:
# the original `for file in file_list:` re-ran the whole consolidation
# once per file, writing the same output repeatedly.
(pd.concat({f.rsplit('.', 1)[0]: pd.read_csv(os.path.join(cwd, f), sep='|',
                                             header=None, names=columns)
            for f in file_list}, names=['Date'])
 .reset_index(0)
 .to_csv('output_file.tsv', index=False)
 )
Sample Input:示例输入:
2018Q1.tsv
------------
860585|RBS PARTNERS L P /CT|13FCONP|1993-02-11|edgar/data/860585/9999999997-04-035713.txt|edgar/data/860585/9999999997-04-035713-index.html
2018Q2.tsv
-------------
926688|SMITH THOMAS W|13F-HR|1993-02-12|edgar/data/926688/9999999997-05-015654.txt|edgar/data/926688/9999999997-05-015654-index.html
Sample consolidated output:
---------------
Date,CIK_number,Companyname,FilingType,Filingdate,filingtext,filingurl
2018Q1,860585,RBS PARTNERS L P /CT,13FCONP,1993-02-11,edgar/data/860585/9999999997-04-035713.txt,edgar/data/860585/9999999997-04-035713-index.html
2018Q2,926688,SMITH THOMAS W,13F-HR,1993-02-12,edgar/data/926688/9999999997-05-015654.txt,edgar/data/926688/9999999997-05-015654-index.html
FileNotFoundError Traceback (most recent call last)
Input In [25], in <cell line: 3>()
1 columns = ['CIK_number', 'Companyname', 'FilingType', 'Filingdate', 'filingtext', 'filingurl']
----> 3 pd.concat({f.rsplit('.')[0]: pd.read_csv(f, sep='|', header=None, names=columns) for f in file_list}, names=['Date']).reset_index(0).to_csv('output_file.tsv', index=False)
Input In [25], in <dictcomp>(.0)
1 columns = ['CIK_number', 'Companyname', 'FilingType', 'Filingdate', 'filingtext', 'filingurl']
----> 3 pd.concat({f.rsplit('.')[0]: pd.read_csv(f, sep='|', header=None, names=columns) for f in file_list}, names=['Date']).reset_index(0).to_csv('output_file.tsv', index=False)
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\util\_decorators.py:311, in deprecate_nonkeyword_arguments.<locals>.decorate.<locals>.wrapper(*args, **kwargs)
305 if len(args) > num_allow_args:
306 warnings.warn(
307 msg.format(arguments=arguments),
308 FutureWarning,
309 stacklevel=stacklevel,
310 )
--> 311 return func(*args, **kwargs)
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:680, in read_csv(filepath_or_buffer, sep, delimiter, header, names, index_col, usecols, squeeze, prefix, mangle_dupe_cols, dtype, engine, converters, true_values, false_values, skipinitialspace, skiprows, skipfooter, nrows, na_values, keep_default_na, na_filter, verbose, skip_blank_lines, parse_dates, infer_datetime_format, keep_date_col, date_parser, dayfirst, cache_dates, iterator, chunksize, compression, thousands, decimal, lineterminator, quotechar, quoting, doublequote, escapechar, comment, encoding, encoding_errors, dialect, error_bad_lines, warn_bad_lines, on_bad_lines, delim_whitespace, low_memory, memory_map, float_precision, storage_options)
665 kwds_defaults = _refine_defaults_read(
666 dialect,
667 delimiter,
(...)
676 defaults={"delimiter": ","},
677 )
678 kwds.update(kwds_defaults)
--> 680 return _read(filepath_or_buffer, kwds)
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:575, in _read(filepath_or_buffer, kwds)
572 _validate_names(kwds.get("names", None))
574 # Create the parser.
--> 575 parser = TextFileReader(filepath_or_buffer, **kwds)
577 if chunksize or iterator:
578 return parser
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:933, in TextFileReader.__init__(self, f, engine, **kwds)
930 self.options["has_index_names"] = kwds["has_index_names"]
932 self.handles: IOHandles | None = None
--> 933 self._engine = self._make_engine(f, self.engine)
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\parsers\readers.py:1217, in TextFileReader._make_engine(self, f, engine)
1213 mode = "rb"
1214 # error: No overload variant of "get_handle" matches argument types
1215 # "Union[str, PathLike[str], ReadCsvBuffer[bytes], ReadCsvBuffer[str]]"
1216 # , "str", "bool", "Any", "Any", "Any", "Any", "Any"
-> 1217 self.handles = get_handle( # type: ignore[call-overload]
1218 f,
1219 mode,
1220 encoding=self.options.get("encoding", None),
1221 compression=self.options.get("compression", None),
1222 memory_map=self.options.get("memory_map", False),
1223 is_text=is_text,
1224 errors=self.options.get("encoding_errors", "strict"),
1225 storage_options=self.options.get("storage_options", None),
1226 )
1227 assert self.handles is not None
1228 f = self.handles.handle
File C:\ProgramData\Anaconda3\lib\site-packages\pandas\io\common.py:789, in get_handle(path_or_buf, mode, encoding, compression, memory_map, is_text, errors, storage_options)
784 elif isinstance(handle, str):
785 # Check whether the filename is to be opened in binary mode.
786 # Binary mode does not support 'encoding' and 'newline'.
787 if ioargs.encoding and "b" not in ioargs.mode:
788 # Encoding
--> 789 handle = open(
790 handle,
791 ioargs.mode,
792 encoding=ioargs.encoding,
793 errors=errors,
794 newline="",
795 )
796 else:
797 # Binary mode
798 handle = open(handle, ioargs.mode)
FileNotFoundError: [Errno 2] No such file or directory: '1993-QTR1.tsv'
I am not able to consolidate.我无法巩固。 Please help
请帮忙
Thank you谢谢
You can use a list comprehension and concat
, chained with to_csv
:您可以使用列表理解和
concat
,与to_csv
链接:
# Parse every file in file_list as tab-separated, stack the frames into
# one table, then write the combined result out as a single tsv file.
frames = [pd.read_csv(path, sep='\t') for path in file_list]
combined = pd.concat(frames,
                     ignore_index=True  # optional, if you want to keep the index
                     )
combined.to_csv('output_file.tsv', sep='\t',
                index=False  # optional, if you don't want the index in the output
                )
file_list = ['2018Q1.tsv', '2018Q2.tsv']
columns = ['CIK_number', 'Companyname', 'FilingType', 'Filingdate', 'filingtext', 'filingurl']

# Map each file's base name (e.g. '2018Q1') to its parsed frame.  When
# concat receives a dict, the keys become an extra index level named
# 'Date'; reset_index(0) then turns that level into an ordinary column.
frames = {}
for fname in file_list:
    frames[fname.rsplit('.')[0]] = pd.read_csv(fname, sep='|',
                                               header=None, names=columns)
merged = pd.concat(frames, names=['Date']).reset_index(0)
merged.to_csv('output_file.tsv', index=False)
Output file: Output 档案:
Date,CIK_number,Companyname,FilingType,Filingdate,filingtext,filingurl
2018Q1,860585,RBS PARTNERS L P /CT,13FCONP,1993-02-11,edgar/data/860585/9999999997-04-035713.txt,edgar/data/860585/9999999997-04-035713-index.html
2018Q2,926688,SMITH THOMAS W,13F-HR,1993-02-12,edgar/data/926688/9999999997-05-015654.txt,edgar/data/926688/9999999997-05-015654-index.html
import os
import pandas as pd
from glob import glob

# Collect the quarterly files; sorting keeps them in Q1, Q2, ... order.
data_files = sorted(glob(r'path/to/directory/*.tsv'))
data_files

columns = ['CIK_number', 'Companyname', 'FilingType', 'Filingdate', 'filingtext', 'filingurl']

# Tag every row with the file it came from.  Use only the base name
# without the extension (e.g. '2018Q1'), not the full glob path --
# assigning `datafile` directly would put 'path/to/directory/2018Q1.tsv'
# in the column instead of the '2018Q1' shown in the expected output.
# (Column name keeps the original 'Year_Quater' spelling so downstream
# code that reads this column keeps working.)
mergedata = pd.concat(
    pd.read_csv(datafile, sep='|', header=None, names=columns)
      .assign(Year_Quater=os.path.splitext(os.path.basename(datafile))[0])
    for datafile in data_files
)
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.