簡體   English   中英

如何處理 awk 文件比較中的空格

[英]How to handle spaces in awk file comparison

我有兩組 awk 腳本的輸入。 文件 1 和文件 2 相同。 同樣,file3 和 File4 是相同的。 但是當我使用 awk 進行比較時,文件 3 和文件 4 的代碼無法正常工作。

代碼:

# Compare two files of tag="value" records. Records are paired on a key built
# from their issuer_grid_id and match_key attributes; fields that differ are
# reported, as are records whose key was left unmatched.
# NOTE(review): FS makes every "=" and every single space a separator, so a
# quoted value that contains spaces is itself split into several fields.
BEGIN { FS="[= ]" }
{
    # Runs for every record of every file: build the pairing key.
    # A space is prepended to $0 so the pattern, which starts with a space,
    # can also match a tag located at the very start of the record.
    match(" "$0,/ issuer_grid_id="[^"]+"/)
    # RSTART/RLENGTH refer to the space-prefixed copy; applied to $0 they skip
    # the leading space and take one extra character after the closing quote.
    key = substr($0,RSTART,RLENGTH)
    match(" "$0,/ match_key="[^"]+"/)
    key = key substr($0,RSTART,RLENGTH)
}
# True while the first input file is being read.
NR==FNR {
    file1[key] = $0     # remember the whole record under its key
    next
}
{
    # Records of the later file(s).
    if ( key in file1 ) {
        # Re-split the stored first-file record using FS and save its fields
        # pairwise: odd-numbered field -> the field that follows it.
        nf = split(file1[key],tmp)
        for (i=1; i<nf; i+=2) {
            # A repeated tmp[i] overwrites the earlier entry.
            # NOTE(review): with values split on spaces this looks like the
            # source of the false mismatches - confirm.
            f1[key,tmp[i]] = tmp[i+1]
        }

        # Walk the current record pairwise and collect every pair whose
        # second field differs from the value saved for the first file.
        msg = sep = ""
        for (i=1; i<NF; i+=2) {
            if ( $(i+1) != f1[key,$i] ) {
                msg = msg sep OFS ARGV[1] "." $i "=" f1[key,$i] OFS FILENAME "." $i "=" $(i+1)
                sep = ","
            }
        }
        if ( msg != "" ) {
            print "Mismatch for key " key msg
        }
        # Matched: remove it so that END only sees unmatched first-file keys.
        delete file1[key]
    }
    else {
        # Key not (or no longer) present in file1[]: keep for the END report.
        file2[key] = $0
    }
}
END {
    # Report the records that were never paired with one from the other file.
    for (key in file1) {
        print "In file1 only:", key, file1[key]
    }
    for (key in file2) {
        print "In file2 only:", key, file2[key]
    }
}

文件 1

period="2021-02-28" book_base_ent_cd="U0028" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="USD" issuer_grid_id="2" match_key="PLCHS252SA20"

文件2

period="2021-02-28" book_base_ent_cd="U0028" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" unit_measure="USD" issuer_grid_id="2" match_key="PLCHS252SA20"

文件3:

period="2021-02-28" book_base_ent_cd="U0027" other_inst_ident="PLCHS258Q463" rep_nom_curr="PLN" reporting_basis="Unit" src_instr_class="Debt" mat_date="2026-08-25" nom_curr="PLN" primary_asset_class="Bond" seniority_type="931" security_status="alive" issuer_name="CUST38677608" intra_group_prud_scope="Issuer is not part of the reporting group" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_frbrnc_stts="NOFRBRNRNGT" prfrmng_stts="Performing" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" src_imprmnt_assssmnt_mthd="COLLECTIVE" accmltd_imprmnt="78.54" accmltd_chngs_fv_cr="0" expsr_vl="0" unit_measure="EUR" unit_measure_nv="EUR" crryng_amnt="24565.13" issuer_grid_id="38677608" match_key="PLCHS258Q463"

文件4:

period="2021-02-28" book_base_ent_cd="U0027" other_inst_ident="PLCHS258Q463" rep_nom_curr="PLN" reporting_basis="Unit" src_instr_class="Debt" mat_date="2026-08-25" nom_curr="PLN" primary_asset_class="Bond" seniority_type="931" security_status="alive" issuer_name="CUST38677608" intra_group_prud_scope="Issuer is not part of the reporting group" intra_group_acc_scope="Issuer is not part of the reporting group" frbrnc_stts="Not forborne or renegotiated" src_frbrnc_stts="NOFRBRNRNGT" prfrmng_stts="Performing" src_prfrmng_stts="KC10.1" dflt_stts_issr="Not in default" src_dflt_stts_issr="KC10.1" dflt_stts_instrmnt="Not in default" src_mes_accntng_clssfctn="AMC" prdntl_prtfl="Non-trading book" imprmnt_stts="Stage 1 (IFRS)" src_imprmnt_stts="1" imprmnt_assssmnt_mthd="Collectively assessed" src_imprmnt_assssmnt_mthd="COLLECTIVE" accmltd_imprmnt="78.54" accmltd_chngs_fv_cr="0" expsr_vl="0" unit_measure="EUR" unit_measure_nv="EUR" crryng_amnt="24565.13" issuer_grid_id="38677608" match_key="PLCHS258Q463"

Output 用於 File1 和 File2

 awk -f compare.awk file1 file2
$

Output 用於 File3 和 File4

awk -f compare.awk file3 file4
Mismatch for key issuer_grid_id="38677608" match_key="PLCHS258Q463"  file1."KC10.1"=dflt_stts_instrmnt file2."KC10.1"=dflt_stts_issr, file1.default"=src_mes_accntng_clssfctn file2.default"=src_dflt_stts_issr, file1."0"=unit_measure file2."0"=expsr_vl, file1."EUR"=crryng_amnt file2."EUR"=unit_measure_nv

第二組(file3 和 file4)中的輸入有何不同,以及如何處理帶引號的輸入字符串中的空格?

謝謝並恭祝安康 !!!

為了幫助您完成此任務以及將來希望對這些數據執行的任何任務,以下代碼(適用於任何 awk)展示了如何在所有輸入文件中識別「標簽-值」對並據此創建鍵值:

$ cat tst.awk
# For every input record: parse its tag="value" pairs into tag2val[], build a
# key from two of the tags, then print the file name, key, pair count and
# every pair.
{
    numTags = mkTag2val($0,tag2val)
    # The two key parts are joined with RS.
    # NOTE(review): presumably chosen because RS cannot occur inside a record - confirm.
    key = tag2val["issuer_grid_id"] RS tag2val["match_key"]
    print "----" ORS FILENAME, "<" key ">", numTags
    # for-in visits the tags in an unspecified order, not in input order.
    for (tag in tag2val) {
        printf "\t%s=\"%s\"\n", tag, tag2val[tag]
    }
}

# mkTag2val(str, tag2val)
#   Empties tag2val, then scans str from left to right for tag="value" pairs
#   and stores each value, without its surrounding double quotes, in
#   tag2val[tag]. Returns the number of pairs found (0 when there are none).
#   The names after the wide gap in the parameter list are local variables.
function mkTag2val(str,tag2val,         pair,name,value,found) {
    delete tag2val
    found = 0
    while ( match(str,/[^ ]+="[^"]*"/) ) {
        pair = substr(str,RSTART,RLENGTH)
        # Drop everything up to and including this pair before the next search.
        str = substr(str,RSTART+RLENGTH)

        # The tag is whatever precedes the first "=" of the pair.
        name = substr(pair,1,index(pair,"=")-1)

        # The value is the pair minus its leading tag=" and its closing quote.
        value = pair
        gsub(/^[^=]+="|"$/,"",value)

        tag2val[name] = value
        found++
    }
    return found
}

$ awk -f tst.awk file{1..4}
----
file1 <2
PLCHS252SA20> 16
        dflt_stts_instrmnt="Not in default"
        period="2021-02-28"
        imprmnt_stts="Stage 1 (IFRS)"
        imprmnt_assssmnt_mthd="Collectively assessed"
        issuer_grid_id="2"
        intra_group_acc_scope="Issuer is not part of the reporting group"
        prdntl_prtfl="Non-trading book"
        dflt_stts_issr="Not in default"
        src_prfrmng_stts="KC10.1"
        src_mes_accntng_clssfctn="AMC"
        src_imprmnt_stts="1"
        src_dflt_stts_issr="KC10.1"
        book_base_ent_cd="U0028"
        unit_measure="USD"
        frbrnc_stts="Not forborne or renegotiated"
        match_key="PLCHS252SA20"
----
file2 <2
PLCHS252SA20> 16
        dflt_stts_instrmnt="Not in default"
        period="2021-02-28"
        imprmnt_stts="Stage 1 (IFRS)"
        imprmnt_assssmnt_mthd="Collectively assessed"
        issuer_grid_id="2"
        intra_group_acc_scope="Issuer is not part of the reporting group"
        prdntl_prtfl="Non-trading book"
        dflt_stts_issr="Not in default"
        src_prfrmng_stts="KC10.1"
        src_mes_accntng_clssfctn="AMC"
        src_imprmnt_stts="1"
        src_dflt_stts_issr="KC10.1"
        book_base_ent_cd="U0028"
        unit_measure="USD"
        frbrnc_stts="Not forborne or renegotiated"
        match_key="PLCHS252SA20"
----
file3 <38677608
PLCHS258Q463> 35
        expsr_vl="0"
        dflt_stts_instrmnt="Not in default"
        src_frbrnc_stts="NOFRBRNRNGT"
        src_instr_class="Debt"
        other_inst_ident="PLCHS258Q463"
        accmltd_chngs_fv_cr="0"
        nom_curr="PLN"
        period="2021-02-28"
        prfrmng_stts="Performing"
        imprmnt_stts="Stage 1 (IFRS)"
        seniority_type="931"
        accmltd_imprmnt="78.54"
        imprmnt_assssmnt_mthd="Collectively assessed"
        issuer_grid_id="38677608"
        intra_group_acc_scope="Issuer is not part of the reporting group"
        src_imprmnt_assssmnt_mthd="COLLECTIVE"
        prdntl_prtfl="Non-trading book"
        dflt_stts_issr="Not in default"
        src_prfrmng_stts="KC10.1"
        rep_nom_curr="PLN"
        src_mes_accntng_clssfctn="AMC"
        security_status="alive"
        mat_date="2026-08-25"
        crryng_amnt="24565.13"
        unit_measure_nv="EUR"
        src_imprmnt_stts="1"
        src_dflt_stts_issr="KC10.1"
        issuer_name="CUST38677608"
        primary_asset_class="Bond"
        book_base_ent_cd="U0027"
        unit_measure="EUR"
        frbrnc_stts="Not forborne or renegotiated"
        intra_group_prud_scope="Issuer is not part of the reporting group"
        reporting_basis="Unit"
        match_key="PLCHS258Q463"
----
file4 <38677608
PLCHS258Q463> 35
        expsr_vl="0"
        dflt_stts_instrmnt="Not in default"
        src_frbrnc_stts="NOFRBRNRNGT"
        src_instr_class="Debt"
        other_inst_ident="PLCHS258Q463"
        accmltd_chngs_fv_cr="0"
        nom_curr="PLN"
        period="2021-02-28"
        prfrmng_stts="Performing"
        imprmnt_stts="Stage 1 (IFRS)"
        seniority_type="931"
        accmltd_imprmnt="78.54"
        imprmnt_assssmnt_mthd="Collectively assessed"
        issuer_grid_id="38677608"
        intra_group_acc_scope="Issuer is not part of the reporting group"
        src_imprmnt_assssmnt_mthd="COLLECTIVE"
        prdntl_prtfl="Non-trading book"
        dflt_stts_issr="Not in default"
        src_prfrmng_stts="KC10.1"
        rep_nom_curr="PLN"
        src_mes_accntng_clssfctn="AMC"
        security_status="alive"
        mat_date="2026-08-25"
        crryng_amnt="24565.13"
        unit_measure_nv="EUR"
        src_imprmnt_stts="1"
        src_dflt_stts_issr="KC10.1"
        issuer_name="CUST38677608"
        primary_asset_class="Bond"
        book_base_ent_cd="U0027"
        unit_measure="EUR"
        frbrnc_stts="Not forborne or renegotiated"
        intra_group_prud_scope="Issuer is not part of the reporting group"
        reporting_basis="Unit"
        match_key="PLCHS258Q463"

將該技術應用於您現有的代碼,然后在您有任何問題時提出一個新問題來顯示您修改后的代碼。

我假設當前代碼的邏輯是有效的,這里真正的問題是如何根據兩個不同的字段定義解析數據,所以 fwiw...

假設:

  • 需要根據「string before an equal sign」和「everything inside a pair of double quotes」拆分輸入記錄
  • 雙引號僅用作分隔符(即數據中不會出現被轉義的雙引號)
  • 雙引號內不會出現等號

對 OP 當前代碼的一些小改動:

  • 我們將使用自定義FPAT (字段模式)定義實際字段的結構,而不是FS (字段分隔符)
  • 此更改( FS => FPAT )要求我們將split() (基於FS )調用替換為patsplit() (基於FPAT )調用(參見 GNU awk 字符串函數)

我們的FPAT定義:

  • FPAT="(\\<[^=]+)|(\"[^\"]+\")"
  • 第一個字段模式定義: (\\<[^=]+)
  • \<指定左側單詞邊界
  • [^=]+是所有不是等號的東西
  • 第二個字段模式定義: (\"[^\"]+\")
  • 以雙引號 ( \" ) 開頭,后跟一個或多個非雙引號字符 ( [^\"]+ ),后跟雙引號 ( \" )

注意: FPAT 和 \< 字邊界都需要 GNU awk (至少 version 4.0)

對 OP 的當前代碼進行以下兩項更改:

awk '
# Compare two files of tag="value" records. Records are paired on a key built
# from their issuer_grid_id and match_key attributes; fields that differ are
# reported, as are records whose key was left unmatched.
# FPAT describes what a field looks like instead of what separates fields:
# either a run of characters other than "=" that starts at a word boundary,
# or a double-quoted string. A quoted value therefore stays one field even
# when it contains spaces.
BEGIN { FPAT="(\\<[^=]+)|(\"[^\"]+\")" }            ### replace FS with FPAT
{
    # Runs for every record of every file: build the pairing key.
    # A space is prepended to $0 so the pattern, which starts with a space,
    # can also match a tag located at the very start of the record.
    match(" "$0,/ issuer_grid_id="[^"]+"/)
    # RSTART/RLENGTH refer to the space-prefixed copy; applied to $0 they skip
    # the leading space and take one extra character after the closing quote.
    key = substr($0,RSTART,RLENGTH)
    match(" "$0,/ match_key="[^"]+"/)
    key = key substr($0,RSTART,RLENGTH)
}
# True while the first input file is being read.
NR==FNR {
    file1[key] = $0     # remember the whole record under its key
    next
}
{
    # Records of the later file(s).
    if ( key in file1 ) {
        # Re-split the stored first-file record using FPAT and save its
        # fields pairwise: tag -> quoted value.
        nf = patsplit(file1[key],tmp)               ### replace split() with patsplit()
        for (i=1; i<nf; i+=2) {
            f1[key,tmp[i]] = tmp[i+1]
        }

        # Walk the current record pairwise and collect every tag whose value
        # differs from the value saved for the first file.
        msg = sep = ""
        for (i=1; i<NF; i+=2) {
            if ( $(i+1) != f1[key,$i] ) {
                msg = msg sep OFS ARGV[1] "." $i "=" f1[key,$i] OFS FILENAME "." $i "=" $(i+1)
                sep = ","
            }
        }
        if ( msg != "" ) {
            print "Mismatch for key " key msg
        }
        # Matched: remove it so that END only sees unmatched first-file keys.
        delete file1[key]
    }
    else {
        # Key not (or no longer) present in file1[]: keep for the END report.
        file2[key] = $0
    }
}
END {
    # Report the records that were never paired with one from the other file.
    for (key in file1) {
        print "In file1 only:", key, file1[key]
    }
    for (key in file2) {
        print "In file2 only:", key, file2[key]
    }
}
'

對 OP 當前的 file3 和 file4 運行此代碼不會生成任何 output(file1 和 file2 亦然):

$ awk -f compare.awk file1 file2
$
$ awk -f compare.awk file3 file4
$

添加 3 個新文件:

  • file5 - 與file4相同,除了period="2021-02-29"
  • file6 - 與file4相同,除了book_base_ent_cd="U0030"
  • file7 - 與file4相同,除了issuer_grid_id="X8677608" (即,我們正在修改用於匹配的鍵)

針對這些附加文件運行新代碼:

$ awk -f compare.awk file4 file5
Mismatch for key issuer_grid_id="38677608" match_key="PLCHS258Q463" file4.period="2021-02-28" file5.period="2021-02-29"

$ awk -f compare.awk file4 file6
Mismatch for key issuer_grid_id="38677608" match_key="PLCHS258Q463" file4.book_base_ent_cd="U0027" file6.book_base_ent_cd="U0030"

$ awk -f compare.awk file4 file7
In file1 only: ...
In file2 only: ...

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM