[英]Why is last line duplicated when appending to a text file?
當我將 append 復制到文本文件中時,我的代碼正在復制最后一行。
一個示例輸入文件是
[headline - https://www.rfa.org/cantonese/news/us-pompeo-06012020072305.html]
This
some-text-here
[date - https://www.rfa.org/cantonese/news/us-pompeo-06012020072305.html]
https://www.rfa.org/cantonese/news/us-pompeo-06012020072305.html/US-Pompeo.mp3
[headline - https://www.rfa.org/cantonese/news/htm/tw-uk-06012020113435.html]
Is
some-text-here
[date - https://www.rfa.org/cantonese/news/htm/tw-uk-06012020113435.html]
https://www.rfa.org/cantonese/news/htm/tw-uk-06012020113435.html/tw-su.mp3
[headline - https://www.rfa.org/cantonese/news/wang-06012020103828.html]
Test
some-text-here
[date - https://www.rfa.org/cantonese/news/wang-06012020103828.html]
https://www.rfa.org/cantonese/news/wang-06012020103828.html/wang.mp3
[headline - https://www.rfa.org/cantonese/news/us-wang-06012020135251.html]
TEST
some-text-here
[headline - https://www.rfa.org/cantonese/news/htm/hk-chan-06012020073718.html]
TEST
some-text-here
[headline - https://www.rfa.org/cantonese/news/ear/ear-state-06012020035108.html]
TEST
some-text-here
[date - https://www.rfa.org/cantonese/news/ear/ear-state-06012020035108.html]
https://www.rfa.org/cantonese/news/ear/ear-state-06012020035108.html/EarState.mp3
[headline - https://www.rfa.org/cantonese/news/htm/hk-innocent-06012020114634.html]
TEST
some-text-here
但是,由於某種原因,output 文件總是在最后一行 append 兩次。 我得到這個ear-state-06012020035108.txt
。
TEST
some-text-here
some-text-here
你可以看到 some-text-here 被復制了。 我只是想將所有文本放入一個文本文件中。 但是,由於某種原因,它只在底部附加了最后一行兩次。
這是我的代碼。
import pandas as pd
import os
import re
import requests
def tabulate_headlines(text="", filename="", verbose=0):
"""Tabulates the information from either a text string (text) or from the
text read from a file (filename) and returns a dataframe.
Example:
# Case: from text
df = tabulate_headlines(text=s, filename="", verbose=1)
# Case: from a file
df = tabulate_headlines(text="", filename="input.txt", verbose=1)
"""
## Read text from input file
if not text and (filename is not None):
with open(filename, "r") as f:
text = f.read()
if text is not None:
## Define regex patterns for
# - headline-text and corresponding source-id
# - headline_url
# - date_url
# - mp3_url
# - source_id from (headline, date and mp3)
headline_text_pat = r"\n?\[headline - https://.*/(.*?)\.html\]\n((.*\n)+?)\n"
headline_pat = r".*\[headline - (https://.*?\.html?)[,\]]"
date_pat = r".*\[date - (https://.*?\.html?)[,\]]"
mp3_pat = r".*\n(https://.*?\.html/.*?\.mp3)\s*\n"
source_id_pat = r"https://.*/(.*?)\.html" # headline, date
source_id_pat_mp3 = r"https://.*/(.*?).html/.*?\.mp3" # mp3
## Compile regex-patterns for speed
headline_text_pat = re.compile(headline_text_pat)
headline_pat = re.compile(headline_pat)
date_pat = re.compile(date_pat)
mp3_pat = re.compile(mp3_pat)
source_id_pat = re.compile(source_id_pat)
source_id_pat_mp3 = re.compile(source_id_pat_mp3)
## Extract headlines (texts)
# and store in a pandas.Series object: headlines
headline_data = headline_text_pat.findall(text + '\n[')
headline_texts = []
source_ids = []
for headline in headline_data:
source_id, headline_text = headline[0], ''.join(headline[1:]).strip()
headline_texts.append(headline_text)
source_ids.append(source_id)
# Save as a pandas.Series object: headlines
headlines = pd.Series(data=headline_texts, index=source_ids)
## Extract the urls (for headline, date, mp3)
headline_urls = headline_pat.findall(text)
date_urls = date_pat.findall(text)
mp3_urls = mp3_pat.findall(text)
## Make temporary dataframes
df_headline = pd.DataFrame({'headline_url': headline_urls})
df_date = pd.DataFrame({'date_url': date_urls})
df_mp3 = pd.DataFrame({'mp3_url': mp3_urls})
## Process temporary dataframes to
df_headline['source_id'] = (df_headline['headline_url']
.str.replace(source_id_pat, r"\1", regex=True))
df_date['source_id'] = (df_date['date_url']
.str.replace(source_id_pat, r"\1", regex=True))
df_mp3['source_id'] = (df_mp3['mp3_url']
.str.replace(source_id_pat_mp3, r"\1", regex=True))
df_headline.set_index('source_id', inplace=True)
df_date.set_index('source_id', inplace=True)
df_mp3.set_index('source_id', inplace=True)
## Combine headlines, dates and mp3s together
df = pd.concat([df_headline, df_date, df_mp3], axis=1)
df['source_id'] = df.index
df['headline_text'] = headlines
df.reset_index(drop=True, inplace=True)
else:
df = None
if verbose>0:
print(df)
return df
def download_mp3(url, filename='out.mp3', output_dir=''):
if not filename.endswith('.mp3'):
filename += '.mp3'
r = requests.get(url, allow_redirects=True)
# update filename (add path if applicable)
filename = update_filename(filename=filename,
output_dir=output_dir)
# write to mp3 file
with open(filename, 'wb') as f:
f.write(r.content)
def write_headline(text, filename='out.txt', output_dir=''):
if not filename.endswith('.txt'):
filename += '.txt'
# update filename (add path if applicable)
filename = update_filename(filename=filename,
output_dir=output_dir)
# write to txt file
with open(filename, 'w') as f:
f.write(text)
def update_filename(filename, output_dir=''):
if output_dir:
output_dir = makedir(path=output_dir)
filename = os.path.join(output_dir, filename)
return filename
def makedir(path):
path = os.path.abspath(path)
if not os.path.exists(path):
os.makedirs(path)
return path
## Define output directory
OUTPUT_DIR_HEADLINES = 'output/headlines'
OUTPUT_DIR_AUDIO = 'output/audio'
## Extract data and tabulate as a pandas dataframe
df = tabulate_headlines(text="", filename="test.txt", verbose=1)
## Determine target rows
target_rows_headlines = ~df.headline_text.isna()
target_rows_mp3 = ~df.mp3_url.isna()
## Write headlines to .txt files
print('Total headlines: {}'.format(target_rows_headlines.sum()))
_ = (df
.loc[target_rows_headlines, ["source_id", "headline_text"]]
.apply(lambda row: write_headline(text = row['headline_text'],
filename = row['source_id'],
output_dir=OUTPUT_DIR_HEADLINES),
axis=1))
print('Writing headlines to .txt files: COMPLETE')
## Save audio to .mp3 files
print('Total audio files: {}'.format(target_rows_mp3.sum()))
_ = (df
.loc[target_rows_mp3, ["source_id", "mp3_url"]]
.apply(lambda row: download_mp3(url = row['mp3_url'],
filename = row['source_id'],
output_dir=OUTPUT_DIR_AUDIO),
axis=1))
print('Downloading audio to .mp3 files: COMPLETE')
問題在於您的正則表達式使headline_text_pat = r"\n?\[headline - https://.*/(.*?)\.html\]\n((.*\n)+?)\n"
如果您對示例文件運行此正則表達式,您將看到捕獲組 1 存儲 URl 的結尾,組 2 捕獲接下來的兩行,第三組捕獲這兩行中的最后一行
Match 1
Full match 0-99
[headline - https://www.rfa.org/cantonese/news/us-pompeo-06012020072305.html]
This
some-text-here
Group 1. 47-71
us-pompeo-06012020072305
Group 2. 78-98
This
some-text-here
Group 3. 83-98
some-text-here
所以問題不在於您在最后一行時多次附加到文本文件,問題在於您的源數據。
for headline in headline_data:
print("#", headline)
OUTPUT
# ('us-pompeo-06012020072305', 'This\nsome-text-here\n', 'some-text-here\n')
# ('tw-uk-06012020113435', 'Is\nsome-text-here\n', 'some-text-here\n')
# ('wang-06012020103828', 'Test\nsome-text-here\n', 'some-text-here\n')
# ('us-wang-06012020135251', 'TEST\nsome-text-here\n', 'some-text-here\n')
# ('hk-chan-06012020073718', 'TEST\nsome-text-here\n', 'some-text-here\n')
# ('ear-state-06012020035108', 'TEST\nsome-text-here\n', 'some-text-here\n')
因此,您可以看到由於您的正則表達式,您的數據中實際上有兩次“some-text-here”,因此您的 append 會按照您的要求執行並附加它,您需要查看您的正則表達式並更改它以停止接收這個。
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.