[英]How to calculate time difference between specific row values in dataframe using python?
[英]how to calculate range between the dataframe values using python
在我的數據框中,有“需求”(Requirement)和“結果”(Results)兩列。 我的目標是寫出一段邏輯:先解析需求列中給出的數值范圍,再將結果列的值與該范圍進行比較,然后標示它是“OK”還是“NOT OK”。
這是我的excel屏幕截圖。
這只是excel的一小幅屏幕截圖。
因此,如果該值在該范圍內,則結果列將顯示為“ OK ”,如果該值不在該范圍內,則將顯示“ NOT OK ”
有沒有辦法在python中使用pandas來完成此工作? 請幫忙! 如果問題有不清楚的地方,請告訴我。
import pandas as pd
import numpy as np
import csv
from tabulate import tabulate
# Read the tab-separated table, then keep only the rows the parsing loop uses:
# level 5, non-missing text, top > 500, non-zero conf, and text that is not a
# single space.
df = pd.read_csv('Form3.tsv', sep='\t')
usable_rows = (
    (df['level'] == 5)
    & df['text'].notna()
    & (df['top'] > 500)
    & (df['conf'] != 0)
    & (df['text'] != ' ')
)
df = df.loc[usable_rows]
column_separator = 0
distinct_pages = df['page_num'].unique()
# NOTE(review): sort_values returns a new frame and the result is discarded
# here, so `df` keeps its file order. Assign the result only after confirming
# that (page_num, line_num, word_num) reproduces the intended reading order.
df.sort_values(by=['page_num', 'line_num', 'word_num'])
# Walks the filtered word rows page by page and assembles table rows of
# [char number, reference location, characteristic designator, requirement
# text, result text] into `all_rows`.
# NOTE(review): the indentation of this loop was lost in extraction; the
# nesting of the statements below must be reconstructed before it can run.
all_rows = []
for each_page in distinct_pages:
# Restrict the working frame to the rows of the current page.
df_each_page = df.loc[df['page_num'] == each_page]
#print(df_each_page)
# Per-page accumulators and state.
char_num = []
line_num = []
ref_loc = []
charateristic_designator = []
results = []
requirement_for_each_line = []
each_row = []
current_line_left = 0
# Start values of 1000 make the first computed column_separator negative
# for any first word whose 'left' is below 2000.
previous_line_left = 1000
previous_line_width = 1000
previous_line_number = 1
line_flag = False
result_flag = False
requirement = []
resultText = ''
char_num_text = ''
ref_loc_text = ''
charateristic_designator_text = ''
for index, row in df_each_page.iterrows():
column_separator = 0
word_num = row['word_num']
text = row['text']
Minor_flag = False
requirement_flag = False
current_line_number = row['line_num']
# line_flag is True exactly when this word has a different line_num than
# the previous word.
if current_line_number == previous_line_number:
line_flag = False
if current_line_number != previous_line_number:
line_flag = True
previous_line_number = current_line_number
current_line_left = row['left']
# Distance between this word's 'left' and the previous word's
# 'left' + 'width' (presumably the horizontal gap between the two words —
# confirm the units of 'left'/'width').
column_separator = current_line_left - previous_line_left - previous_line_width
previous_line_left = current_line_left
previous_line_width = row['width']
# A word in the 1100-1250 band that follows existing results with a gap of
# exactly 6 is flagged as a continuation of the result text.
if (len(results) and column_separator == 6) and 1250 > current_line_left > 1100:
result_flag = True
if line_flag is True:
result_flag = False
# One of the first three words of a line, in the 500-800 band with a
# negative separator and not the literal 'Note', is flagged as a
# continuation of the current requirement.
if len(requirement) and word_num in (1,2,3) and text != 'Note' and 800 > current_line_left > 500 and column_separator < 0:
requirement_flag = True
# Flush: on a new line that is neither a requirement nor a result
# continuation, and once all three identifier texts are set, emit the
# accumulated row and reset the accumulators.
if len(requirement) and line_flag is True and (word_num == 1 or (word_num == 2 and column_separator < 0) or (word_num == 4 and column_separator > 300))and requirement_flag is False and result_flag is False and char_num_text != '' and ref_loc_text != '' and charateristic_designator_text != '':
each_row.append(char_num_text)
char_num_text = ''
each_row.append(ref_loc_text)
ref_loc_text = ''
each_row.append(charateristic_designator_text)
charateristic_designator_text = ''
# dict.fromkeys drops repeated requirement words, keeping first-seen order.
each_row.append(' '.join(list(dict.fromkeys(requirement))))
each_row.append(' '.join(results))
all_rows.append(each_row)
each_row = []
requirement_for_each_line.append(requirement)
requirement = []
results = []
# Words are assigned to columns by the band their 'left' value falls in.
# 70-100 with a negative separator: characteristic number.
if column_separator < 0 and 100 > current_line_left > 70:
line_num.append(current_line_number)
char_num.append(text)
char_num_text = text
# 190-210: reference location.
if 210 > current_line_left >= 190:
ref_loc.append(text)
ref_loc_text = text
# 370-380 with exactly five characters: characteristic designator.
if 380 > current_line_left > 370 and len(text) == 5:
charateristic_designator.append(text)
Minor_flag = True
charateristic_designator_text = text
# Default the designator to 'Minor' when none is currently set.
if Minor_flag == False and charateristic_designator_text == '':
charateristic_designator.append('Minor')
charateristic_designator_text = 'Minor'
# 500-1090: requirement text.
if 1090 > current_line_left >= 500:# and 98 > column_separator > 0:
requirement.append(text)
# 1100-1250: result text.
if 1250 > current_line_left > 1100:# and column_separator > 100:
results.append(text)
resultText = text
result_flag = False
# Print the collected rows as a plain-text table.
table = tabulate(all_rows, headers=["5. Char No", "6. Reference", "7. Characteristic", "8. Requirement", "9.Results"])
print(table)
# Write the same rows to CSV. The CSV header labels for columns 6 and 7 are
# longer than the ones used for the printed table above.
headers=["5. Char No", "6. Reference Location", "7. Characteristic Recoginition", "8. Requirement", "9.Results"]
with open('test_file.csv', 'w', newline = '') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(headers)
    # writerows emits every row in one call; the original built a throwaway
    # list with a comprehension purely for its side effects.
    writer.writerows(all_rows)
這段代碼相當復雜,因為文本基本上是用tesseract提取出來的。 我得到的輸出是一個很大的表格,需求列中也包含純文本,但我只關注其中帶有數值條件的行。 正如您在代碼末尾看到的那樣,我已將輸出寫入一個.csv文件,現在最重要的是我需要找出一種邏輯來取得圖中所示的范圍值。
# Sample data: a requirement is either a "low-high" range or free text, and a
# result may be numeric, a numeric string, or junk.
df = pd.DataFrame({
    '8. Requirement': ['.685-.695', '.340-.350', '.737-.740', 'foo', '42'],
    '9.Results': [.68, .345, '.739', '.68', 'bar']
})
# or df = pd.read_csv('filename.csv', sep='\t')

# Pull the two bounds out of "low-high". The pattern is a raw string so that
# \d and \. reach the regex engine instead of being parsed as (invalid) string
# escapes. Rows that do not match get NaN in both columns.
bounds = df['8. Requirement'].str.extract(r'(\d*\.?\d+)-(\d*\.?\d+)')
df = df.join(bounds.rename(columns={0: 'min', 1: 'max'}))

# Non-numeric results become NaN; any comparison against NaN is False, so rows
# without a usable range or result end up as not OK.
df['OK'] = pd.to_numeric(df['9.Results'], errors='coerce').between(
    df['min'].astype(float), df['max'].astype(float)
)
print(df)
輸出:
8. Requirement 9.Results min max OK
0 .685-.695 0.68 .685 .695 False
1 .340-.350 0.345 .340 .350 True
2 .737-.740 .739 .737 .740 True
3 foo .68 NaN NaN False
4 42 bar NaN NaN False
另一個解決方案:
import io

data = """
9.results,8. Requirement
.68,.13-.70
.34,.45-.939
.74,.45-.987
.68,.13-.67
.34,.25-.939
.74,.95-.987
"""
# pd.compat.StringIO no longer exists in current pandas; the standard-library
# io.StringIO provides the same in-memory text buffer.
df = pd.read_csv(io.StringIO(data), sep=',')
# Split "low-high" into two temporary columns and test each result against
# its own bounds (both ends inclusive).
df[['low', 'high']] = df['8. Requirement'].str.split('-', expand=True)
df['Status'] = df['9.results'].between(df['low'].astype(float), df['high'].astype(float))
# The helper columns are only needed for the comparison.
df = df.drop(columns=['low', 'high'])
print(df)
9.results 8. Requirement Status
0 0.68 .13-.70 True
1 0.34 .45-.939 False
2 0.74 .45-.987 True
3 0.68 .13-.67 False
4 0.34 .25-.939 True
5 0.74 .95-.987 False
文件的語法:
df = pd.read_csv('test1.csv',sep='\t')# sep='\t' is for a tab-delimited file; pass the file's actual delimiter otherwise
這是其中一種解決方案,希望它能回答您的問題。 我將范圍列拆分為兩列(下限和上限),以簡化計算。
import pandas as pd
# One entry per measurement: lower bound, upper bound, measured result.
data = [
    [0.685, 0.695, 0.68],
    [0.340, 0.350, 0.345],
    [0.737, 0.740, 0.736],
]
column_names = ['Requirement1', 'Requirement2', 'Results']
df = pd.DataFrame(data, columns=column_names)
print(df)
Requirement1 Requirement2 Results
0 0.685 0.695 0.680
1 0.340 0.350 0.345
2 0.737 0.740 0.736
### If between the range
# Strictly-between check: a result equal to either bound is not in range.
# Current pandas requires the string form ('both', 'left', 'right' or
# 'neither') for `inclusive`; the boolean False is rejected.
f = df['Results'].between(df['Requirement1'], df['Requirement2'], inclusive='neither')
# Place the boolean outcome as the fourth column.
df.insert(3,'Status',f)
Requirement1 Requirement2 Results Status
0 0.685 0.695 0.680 False
1 0.340 0.350 0.345 True
2 0.737 0.740 0.736 False
### Changing Bool into custom text
d = {True: 'OK', False: 'Not OK'}
# Translate the boolean 'Status' column directly. The original scanned every
# cell with DataFrame.applymap (deprecated in recent pandas) only to find the
# boolean cells, which here are the 'Status' values.
df['Status'] = df['Status'].map(d)
list_col=['Results']
# Drop the raw measurements before exporting.
r = df.drop(list_col,axis=1)
r
# NOTE(review): `filename` is not defined anywhere in this snippet; it must be
# set to the output path before this line runs.
r.to_csv(filename,mode = 'w', index=False)
Requirement1 Requirement2 Status
0 0.685 0.695 Not OK
1 0.340 0.350 OK
2 0.737 0.740 Not OK
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.