[英]How to handle spaces in awk file comparison
我有兩組 awk 腳本的輸入。 文件 1 和文件 2 相同。 同樣,file3 和 File4 是相同的。 但是當我使用 awk 進行比較時,文件 3 和文件 4 的代碼無法正常工作。
代碼:
# compare.awk as posted in the question: pairs the records of two input files
# by their issuer_grid_id and match_key attributes and reports differences.
# NOTE(review): FS splits on every "=" and on every single space, so a quoted
# value that contains spaces (e.g. "Not in default") is cut into several
# fields and the tag/value pairing in the loops below goes out of step.
BEGIN { FS="[= ]" }
# Runs for every record of every input file: build the record key.
{
# The regexp begins with a space and is matched against " "$0, but substr()
# reads from $0, whose positions are one lower. The extracted text therefore
# starts at the tag name and runs one character past the closing quote
# (when such a character exists).
match(" "$0,/ issuer_grid_id="[^"]+"/)
key = substr($0,RSTART,RLENGTH)
match(" "$0,/ match_key="[^"]+"/)
# key is the two extracted substrings concatenated.
key = key substr($0,RSTART,RLENGTH)
}
# NR==FNR holds while the first input file is being read: remember each
# record under its key and skip the comparison rule below.
NR==FNR {
file1[key] = $0
next
}
# Reached only for records where NR!=FNR, i.e. after the first input file.
{
if ( key in file1 ) {
# Re-split the stored first-file record with FS and index its fields as
# alternating tag/value pairs: f1[key,tag] = value.
nf = split(file1[key],tmp)
for (i=1; i<nf; i+=2) {
f1[key,tmp[i]] = tmp[i+1]
}
# Walk the current record the same way and collect every pair whose value
# differs from the one stored for the first file.
msg = sep = ""
for (i=1; i<NF; i+=2) {
if ( $(i+1) != f1[key,$i] ) {
# The stored value is labelled with ARGV[1], the current one with FILENAME.
msg = msg sep OFS ARGV[1] "." $i "=" f1[key,$i] OFS FILENAME "." $i "=" $(i+1)
sep = ","
}
}
if ( msg != "" ) {
print "Mismatch for key " key msg
}
# Remove the matched key so END lists only unmatched first-file records.
delete file1[key]
}
else {
# Key not present in file1: keep the record for the END report.
file2[key] = $0
}
}
# Report the records whose key was never matched against the other file.
END {
for (key in file1) {
print "In file1 only:", key, file1[key]
}
for (key in file2) {
print "In file2 only:", key, file2[key]
}
}
文件 1
period="2021-02-28" book_base_ent_cd="U0028" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="USD" issuer_grid_id="2" match_key="PLCHS252SA20"
文件2
period="2021-02-28" book_base_ent_cd="U0028" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="USD" issuer_grid_id="2" match_key="PLCHS252SA20"
文件3:
period="2021-02-28" book_base_ent_cd="U0027" other_inst_ident="PLCHS258Q463" rep_nom_curr="PLN" reporting_basis="Unit" src_instr_class="Debt" mat_date="2026-08-25" nom_curr="PLN" primary_asset_class="Bond" seniority_type="931" security_status="alive" issuer_name="CUST38677608" intra_group_prud_scope="Issuer is not part of the reporting group" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_frbrnc_stts="NOFRBRNRNGT" prfrmng_stts="Performing" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" src_imprmnt_assssmnt_mthd="COLLECTIVE" accmltd_imprmnt="78.54" accmltd_chngs_fv_cr="0" expsr_vl="0" unit_measure="EUR" unit_measure_nv="EUR" crryng_amnt="24565.13" issuer_grid_id="38677608" match_key="PLCHS258Q463"
文件4:
period="2021-02-28" book_base_ent_cd="U0027" other_inst_ident="PLCHS258Q463" rep_nom_curr="PLN" reporting_basis="Unit" src_instr_class="Debt" mat_date="2026-08-25" nom_curr="PLN" primary_asset_class="Bond" seniority_type="931" security_status="alive" issuer_name="CUST38677608" intra_group_prud_scope="Issuer is not part of the reporting group" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_frbrnc_stts="NOFRBRNRNGT" prfrmng_stts="Performing" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" src_imprmnt_assssmnt_mthd="COLLECTIVE" accmltd_imprmnt="78.54" accmltd_chngs_fv_cr="0" expsr_vl="0" unit_measure="EUR" unit_measure_nv="EUR" crryng_amnt="24565.13" issuer_grid_id="38677608" match_key="PLCHS258Q463"
Output 用於 File1 和 File2
awk -f compare.awk file1 file2
$
Output 用於 File3 和 File4
awk -f compare.awk file3 file4
Mismatch for key issuer_grid_id="38677608" match_key="PLCHS258Q463" file1."KC10.1"=dflt_stts_instrmnt file2."KC10.1"=dflt_stts_issr, file1.default"=src_mes_accntng_clssfctn file2.default"=src_dflt_stts_issr, file1."0"=unit_measure file2."0"=expsr_vl, file1."EUR"=crryng_amnt file2."EUR"=unit_measure_nv
第二組(file3 和 file4)中的輸入有何不同,以及如何在引用的輸入字符串中處理空格?
謝謝並恭祝安康 !!!
為了幫助您完成此任務以及您希望對數據執行的任何未來任務 - 使用任何 awk,以下是如何在所有輸入文件中識別您的標簽-值對並為它們創建鍵值:
$ cat tst.awk
# For every input record: parse its tag="value" pairs into tag2val[], build a
# key from two of the values, then print the file name, the key, the number
# of pairs found and every pair.
{
numTags = mkTag2val($0,tag2val)
# The key is the two values joined with RS (the record separator), which is
# why the key is printed across two lines in the output.
key = tag2val["issuer_grid_id"] RS tag2val["match_key"]
print "----" ORS FILENAME, "<" key ">", numTags
# awk does not specify the visiting order of for-in, so the pairs are not
# necessarily printed in input order.
for (tag in tag2val) {
printf "\t%s=\"%s\"\n", tag, tag2val[tag]
}
}
# mkTag2val(str, tag2val)
#   Fills the array tag2val with one entry per tag="value" pair found in str
#   (tag2val[tag] = value, without the surrounding quotes) and returns the
#   number of pairs matched. tag, val and cnt are extra parameters used as
#   local variables; callers pass only str and tag2val.
function mkTag2val(str,tag2val, tag,val,cnt) {
# Discard entries left over from a previous call.
delete tag2val
# A pair is a run of non-space characters, "=", then a double-quoted string
# that may be empty.
while ( match(str,/[^ ]+="[^"]*"/) ) {
++cnt
tag = val = substr(str,RSTART,RLENGTH)
# Keep only what precedes the first "=".
sub(/=.*/,"",tag)
# Strip the leading tag=" and the closing quote, leaving the bare value.
gsub(/^[^=]+="|"$/,"",val)
tag2val[tag] = val
# Continue scanning after the pair just consumed.
str = substr(str,RSTART+RLENGTH)
}
# cnt+0 yields the number 0 rather than an unset value when nothing matched.
return cnt+0
}
$ awk -f tst.awk file{1..4}
----
file1 <2
PLCHS252SA20> 16
dflt_stts_instrmnt="Not in default"
period="2021-02-28"
imprmnt_stts="Stage 1 (IFRS)"
imprmnt_assssmnt_mthd="Collectively assessed"
issuer_grid_id="2"
intra_group_acc_scope="Issuer is not part of the reporting group"
prdntl_prtfl="Non-trading book"
dflt_stts_issr="Not in default"
src_prfrmng_stts="KC10.1"
src_mes_accntng_clssfctn="AMC"
src_imprmnt_stts="1"
src_dflt_stts_issr="KC10.1"
book_base_ent_cd="U0028"
unit_measure="USD"
frbrnc_stts="Not forborne or renegotiated"
match_key="PLCHS252SA20"
----
file2 <2
PLCHS252SA20> 16
dflt_stts_instrmnt="Not in default"
period="2021-02-28"
imprmnt_stts="Stage 1 (IFRS)"
imprmnt_assssmnt_mthd="Collectively assessed"
issuer_grid_id="2"
intra_group_acc_scope="Issuer is not part of the reporting group"
prdntl_prtfl="Non-trading book"
dflt_stts_issr="Not in default"
src_prfrmng_stts="KC10.1"
src_mes_accntng_clssfctn="AMC"
src_imprmnt_stts="1"
src_dflt_stts_issr="KC10.1"
book_base_ent_cd="U0028"
unit_measure="USD"
frbrnc_stts="Not forborne or renegotiated"
match_key="PLCHS252SA20"
----
file3 <38677608
PLCHS258Q463> 35
expsr_vl="0"
dflt_stts_instrmnt="Not in default"
src_frbrnc_stts="NOFRBRNRNGT"
src_instr_class="Debt"
other_inst_ident="PLCHS258Q463"
accmltd_chngs_fv_cr="0"
nom_curr="PLN"
period="2021-02-28"
prfrmng_stts="Performing"
imprmnt_stts="Stage 1 (IFRS)"
seniority_type="931"
accmltd_imprmnt="78.54"
imprmnt_assssmnt_mthd="Collectively assessed"
issuer_grid_id="38677608"
intra_group_acc_scope="Issuer is not part of the reporting group"
src_imprmnt_assssmnt_mthd="COLLECTIVE"
prdntl_prtfl="Non-trading book"
dflt_stts_issr="Not in default"
src_prfrmng_stts="KC10.1"
rep_nom_curr="PLN"
src_mes_accntng_clssfctn="AMC"
security_status="alive"
mat_date="2026-08-25"
crryng_amnt="24565.13"
unit_measure_nv="EUR"
src_imprmnt_stts="1"
src_dflt_stts_issr="KC10.1"
issuer_name="CUST38677608"
primary_asset_class="Bond"
book_base_ent_cd="U0027"
unit_measure="EUR"
frbrnc_stts="Not forborne or renegotiated"
intra_group_prud_scope="Issuer is not part of the reporting group"
reporting_basis="Unit"
match_key="PLCHS258Q463"
----
file4 <38677608
PLCHS258Q463> 35
expsr_vl="0"
dflt_stts_instrmnt="Not in default"
src_frbrnc_stts="NOFRBRNRNGT"
src_instr_class="Debt"
other_inst_ident="PLCHS258Q463"
accmltd_chngs_fv_cr="0"
nom_curr="PLN"
period="2021-02-28"
prfrmng_stts="Performing"
imprmnt_stts="Stage 1 (IFRS)"
seniority_type="931"
accmltd_imprmnt="78.54"
imprmnt_assssmnt_mthd="Collectively assessed"
issuer_grid_id="38677608"
intra_group_acc_scope="Issuer is not part of the reporting group"
src_imprmnt_assssmnt_mthd="COLLECTIVE"
prdntl_prtfl="Non-trading book"
dflt_stts_issr="Not in default"
src_prfrmng_stts="KC10.1"
rep_nom_curr="PLN"
src_mes_accntng_clssfctn="AMC"
security_status="alive"
mat_date="2026-08-25"
crryng_amnt="24565.13"
unit_measure_nv="EUR"
src_imprmnt_stts="1"
src_dflt_stts_issr="KC10.1"
issuer_name="CUST38677608"
primary_asset_class="Bond"
book_base_ent_cd="U0027"
unit_measure="EUR"
frbrnc_stts="Not forborne or renegotiated"
intra_group_prud_scope="Issuer is not part of the reporting group"
reporting_basis="Unit"
match_key="PLCHS258Q463"
將該技術應用於您現有的代碼,然后在您有任何問題時提出一個新問題來顯示您修改后的代碼。
我假設當前代碼的邏輯是有效的,這里真正的問題是如何根據兩個不同的字段定義解析數據,所以 fwiw...
假設:按以下兩種字段定義拆分輸入記錄:
- 等號之前的字符串(string before an equal sign)
- 或一對雙引號內的所有內容(everything inside a pair of double quotes)
對 OP 當前代碼的一些小改動:
- 使用 FPAT(字段模式)定義實際字段的結構,而不是使用 FS(字段分隔符)
- 字段解析方式的改變(FS => FPAT)要求我們將 split()(基於 FS)調用替換為 patsplit()(基於 FPAT)調用(參見 GNU awk 字符串函數)
我們的 FPAT 定義:
FPAT="(\\<[^=]+)|(\"[^\"]+\")"
- (\\<[^=]+):\< 指定左側單詞邊界,[^=]+ 是所有不是等號的東西
- (\"[^\"]+\"):以雙引號(\")開頭,后跟一個或多個非雙引號字符([^\"]+),后跟雙引號(\")
注意:FPAT 和 \< 字邊界需要 GNU awk(至少 version 4.0)
對 OP 的當前代碼進行以下兩項更改:
awk '
# Revised compare script: fields are described by content (FPAT) instead of
# by separator (FS), so quoted values containing spaces stay in one field.
# FPAT and patsplit() are GNU awk features.
# NOTE(review): this command line names no input files; ARGV[1] and FILENAME
# used below are only meaningful when two files are supplied.
# A field is either a run of non-"=" characters starting at a word boundary,
# or a double-quoted string with at least one character between the quotes.
# NOTE(review): because of the "+" an empty value ("") is not matched by the
# quoted alternative.
BEGIN { FPAT="(\\<[^=]+)|(\"[^\"]+\")" } ### replace FS with FPAT
# Runs for every record of every input file: build the record key.
{
# The regexp begins with a space and is matched against " "$0, but substr()
# reads from $0, whose positions are one lower. The extracted text therefore
# starts at the tag name and runs one character past the closing quote
# (when such a character exists).
match(" "$0,/ issuer_grid_id="[^"]+"/)
key = substr($0,RSTART,RLENGTH)
match(" "$0,/ match_key="[^"]+"/)
# key is the two extracted substrings concatenated.
key = key substr($0,RSTART,RLENGTH)
}
# NR==FNR holds while the first input file is being read: remember each
# record under its key and skip the comparison rule below.
NR==FNR {
file1[key] = $0
next
}
# Reached only for records where NR!=FNR, i.e. after the first input file.
{
if ( key in file1 ) {
# Re-split the stored first-file record (patsplit uses FPAT when no pattern
# argument is given) and index it as tag/value pairs: f1[key,tag] = value.
nf = patsplit(file1[key],tmp) ### replace split() with patsplit()
for (i=1; i<nf; i+=2) {
f1[key,tmp[i]] = tmp[i+1]
}
# Walk the current record the same way and collect every pair whose value
# differs from the one stored for the first file.
msg = sep = ""
for (i=1; i<NF; i+=2) {
if ( $(i+1) != f1[key,$i] ) {
# The stored value is labelled with ARGV[1], the current one with FILENAME.
msg = msg sep OFS ARGV[1] "." $i "=" f1[key,$i] OFS FILENAME "." $i "=" $(i+1)
sep = ","
}
}
if ( msg != "" ) {
print "Mismatch for key " key msg
}
# Remove the matched key so END lists only unmatched first-file records.
delete file1[key]
}
else {
# Key not present in file1: keep the record for the END report.
file2[key] = $0
}
}
# Report the records whose key was never matched against the other file.
END {
for (key in file1) {
print "In file1 only:", key, file1[key]
}
for (key in file2) {
print "In file2 only:", key, file2[key]
}
}
'
對 OP 的當前 file1/file2 以及 file3/file4 運行此代碼不會生成任何輸出:
$ awk -f compare.awk file1 file2
$
$ awk -f compare.awk file3 file4
$
添加 3 個新文件:
- file5:與 file4 相同,除了 period="2021-02-29"
- file6:與 file4 相同,除了 book_base_ent_cd="U0030"
- file7:與 file4 相同,除了 issuer_grid_id="X8677608"(即,我們正在修改鍵(key))
針對這些附加文件運行新代碼:
$ awk -f compare.awk file4 file5
Mismatch for key issuer_grid_id="38677608" match_key="PLCHS258Q463" file4.period="2021-02-28" file5.period="2021-02-29"
$ awk -f compare.awk file4 file6
Mismatch for key issuer_grid_id="38677608" match_key="PLCHS258Q463" file4.book_base_ent_cd="U0027" file6.book_base_ent_cd="U0030"
$ awk -f compare.awk file4 file7
In file1 only: ...
In file2 only: ...
聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.