繁体   English   中英

如何使用 AWK 透视数据

[英]How to Pivot Data Using AWK

从:

DT  X   Y   Z
10  75  0   3
20  100 1   6
30  125 2   9

至:

DT  ID  VALUE
10  X   75
20  Y   0
30  Z   3
10  X   100
20  Y   1
30  Z   6
10  X   125
20  Y   2
30  Z   9

我目前的做法:

# My original dataset is comma-separated and has 280 columns.
#
# For every value column (the 4th onward) this loop uses st.awk to extract the
# DT and ID columns plus that one value column, tags each extracted row with
# the value column's name, and appends the rows to "New$tempfile".

        tempfile=dataset.csv

        # Read the header line once (spaces removed, as the per-column lookup
        # always did) instead of re-reading the file for every column.
        header_line=$(head -n 1 "$tempfile" | sed 's/ //g')
        col_count=$(printf '%s\n' "$header_line" | awk -F"," '{ print NF }')

        # A plain counter loop: `seq 4 N` counts DOWN on some platforms when
        # N < 4, which would process columns that do not exist.
        i=4
        while [ "$i" -le "$col_count" ]; do
                echo "$i"    # progress indicator

                col_name=$(printf '%s\n' "$header_line" | awk -F"," -v n="$i" '{ print $n }')

                # st.awk already skips the header line, so every line it emits
                # is a data row; no further line is skipped here (the previous
                # NR>1 filter discarded the first data row of every column).
                # The column name is passed with -v rather than spliced into
                # the awk program text, so a quote in a header cannot break it.
                awk -F"," -v header="DT,ID,$col_name" -f st.awk "$tempfile" |
                        awk -v name="$col_name" '{ print substr($0, index($0, $1)) "," name }' |
                        sed 's/ //g' >> "New$tempfile"

                i=$((i + 1))
        done

  # file st.awk:
# the code below was found on some stackoverflow page, with some minor changes
    # st.awk -- print only the columns whose header name appears in `header`.
    #
    # Usage:  awk -F"," -v header="NAME1,NAME2,..." -f st.awk file...
    # Input:  the first line of each file holds the column names.
    # Output: for every other line, the selected fields joined with ","
    #         in file column order (no leading separator).
    BEGIN {
        # Turn the comma-separated -v header list into a lookup set.
        n = split(header, names, ",")
        for (k = 1; k <= n; k++) {
            wanted[names[k]] = 1
        }
    }

    # First line of each input file: record which column numbers to keep.
    FNR == 1 {
        split("", cols)   # clears cols so each file gets its own column map
        for (i = 1; i <= NF; i++) {
            if ($i in wanted) {
                cols[i] = 1
            }
        }
        next
    }

    # Every other line: join the selected fields with ",".
    {
        res = ""
        picked = 0
        for (i = 1; i <= NF; i++) {
            if (i in cols) {
                # Separator goes only BETWEEN fields; counting picks (rather
                # than testing res) keeps this right when a field is empty.
                res = res (picked++ ? "," : "") $i
            }
        }
        if (picked) {
            print res
        }
    }

你可以试试这个 awk(MAWK 版本 1.2)

您的数据可以是 5x5 或更多

mawk -v OFS='\t' '
# Header line: remember the value-column names (fields 2..NF) and print the
# three-column output header.  name[0] is the empty key used for field 1.
NR == 1 {
    ncols = NF - 1
    name[0] = ""
    for (c = 2; c <= NF; c++)
        name[c - 1] = $c
    print $1 OFS "ID" OFS "VALUE"
    next
}
# Data lines are handled in batches of ncols rows.  Each row is stored under
# its 1-based position in the batch; when the batch is full it is printed.
{
    slot = (NR - 1) % ncols
    if (slot == 0)
        slot = ncols
    for (c = 0; c <= ncols; c++)
        cell[name[c], slot] = $(c + 1)
    if (slot != ncols)
        next
    # Each output line pairs field 1 of the k-th row in the batch with the
    # k-th column name and with that column taken from the r-th row.
    for (r = 1; r <= ncols; r++)
        for (k = 1; k <= ncols; k++)
            print cell[name[0], k] OFS name[k] OFS cell[name[k], r]
}
' infile

输入:

--     ColOne ColTwo ColThr
RowOne    A      B      C      D      E
RowTwo    F      G      H      I      J
RowThr    K      L      M      N      O
RowFor    P      Q      R      S      T
RowFiv    U      V      W      X      Y

输出:

RowNbr | ColNbr | RowColVal
------ | ------ | ---------
RowOne | ColOne | A        
RowOne | ColTwo | B        
RowOne | ColThr | C        

RowTwo | ColOne | F        
RowTwo | ColTwo | G        
RowTwo | ColThr | H        

RowThr | ColOne | K        
RowThr | ColTwo | L        
RowThr | ColThr | M        

透视脚本:

# Unpivot a table: print one "row header | column header | value" line for
# each cell, with a blank line after every input row's group of lines.
#
# The first input line supplies the column headers (its first field is
# ignored).  Data rows are collected in batches of as many rows as there are
# column headers; a batch is printed when it is complete, so rows left over
# in an incomplete final batch are never printed.

BEGIN {
  OFS       = " | "                           # set as before; the single-argument print/printf calls below do not use it
  detailFmt = "%6s | %-6s | %-9s\n"           # row header, column header, cell value
  headerFmt = "%6s | ColNbr | RowColVal\n"    # caption for the first column is passed in
  divider   = "------ | ------ | ---------"
  print ""                                    # blank line ahead of the output header
}

# Header line: remember the column names and print the output header.
NR == 1 {
  nCols = NF - 1                              # every field except the first is a column header
  colName[0] = ""                             # empty key under which each row's first field is stored
  for (c = 2; c <= NF; c++)
    colName[c - 1] = $c

  printf(headerFmt, "RowNbr")
  print divider
  next
}

# Data lines: store the row in its batch slot; print once the batch is full.
{
  slot = (NR - 1) % nCols                     # 0 .. nCols-1
  if (slot == 0)
    slot = nCols                              # shift to 1 .. nCols

  for (c = 0; c <= nCols; c++)                # c == 0 stores the row header
    cell[colName[c], slot] = $(c + 1)

  if (slot != nCols)
    next                                      # batch not complete yet

  for (r = 1; r <= nCols; r++) {              # each stored row of the batch
    for (c = 1; c <= nCols; c++)              # each named column of that row
      printf(detailFmt, cell[colName[0], r], colName[c], cell[colName[c], r])
    print ""                                  # blank line after each row's group
  }
}

暂无
暂无

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM