[英]How to Pivot Data Using AWK
从:
DT X Y Z
10 75 0 3
20 100 1 6
30 125 2 9
至:
DT ID VALUE
10 X 75
20 Y 0
30 Z 3
10 X 100
20 Y 1
30 Z 6
10 X 125
20 Y 2
30 Z 9
完成
#my original dataset is separated by "," and have 280 cols
tempfile=dataset.csv;
col_count=`head -n1 $tempfile | tr -cd "," | wc -c`;
col_count=`expr $col_count + 1`;
for i in `seq 4 $col_count`; do
echo $i;
pt="{print \$"$i"}";
col_name=`head -n 1 $tempfile | sed s'/ //'g | awk -F"," "$pt"`;
awk -F"," -v header="DT,ID,$col_name" -f st.awk $tempfile | awk 'NR>1 {print substr($0,index($0,$1))",'"$col_name"'"}' | sed 's/ //g' >> New$tempfile;
done;
# file st.awk:
# the code below was found on some stackoverflow page, with some minor changes
BEGIN {
# Parse headers into an assoc array h
split(header, a, ",")
for(i in a) {
h[a[i]]=2
}
}
# Find the column numbers in the first line of a file
FNR==1{
split("", cols) # This will re-init cols
for(i=1;i<=NF;i++) {
if($i in h) {
cols[i]=1
}
}
next
}
# Print those columns on all other lines
{
res = ""
for(i=1;i<=NF;i++) {
if(i in cols) {
s = res ? OFS : ""
res = res "," $i
}
}
if (res) {
print res
}
}
你可以试试这个 awk(MAWK 版本 1.2)
您的数据可以是 5x5 或更多
mawk -v OFS='\t' '
NR==1 {
nbfield=(NF-1)
for(i=1;i<NF;i++)
ID[i]=$(i+1)
print $1 OFS "ID" OFS "VALUE"
next
}
{
numrecord=((NR-1)%nbfield)
numrecord = numrecord ? numrecord : nbfield
for(i=0;i<=nbfield;i++)
val[ID[i],numrecord]=$(i+1)
}
numrecord==nbfield {
for(i=1;i<=nbfield;i++)
for(j=1;j<=nbfield;j++)
print val[ID[0],j] OFS ID[j] OFS val[ID[j],i]
}
' infile
输入:
-- ColOne ColTwo ColThr
RowOne A B C D E
RowTwo F G H I J
RowThr K L M N O
RowFor P Q R S T
RowFiv U V W X Y
输出:
RowNbr | ColNbr | RowColVal
------ | ------ | ---------
RowOne | ColOne | A
RowOne | ColTwo | B
RowOne | ColThr | C
RowTwo | ColOne | F
RowTwo | ColTwo | G
RowTwo | ColThr | H
RowThr | ColOne | K
RowThr | ColTwo | L
RowThr | ColThr | M
枢轴脚本:
# pivot a table
BEGIN { # before processing innput lines, emit output header
OFS = " | " # set the output field-separator
fmtOutDtl = "%6s | %-6s | %-9s" "\n" # set the output format for all detail lines: InpRowHdr, InpColHdr, InpVal
fmtOutHdr = "%6s | ColNbr | RowColVal" "\n" # set the output format for the header line
strOutDiv = "------ | ------ | ---------" # set the divider line
print "" # emit blank line before output
} # done with output header
NR == 1 { # when we are on the innput header line / the first row
FldCnt = ( NF - 1 ) # number of columns to process is number of fields on this row, except for the first val
for( idxCol = 1; idxCol < NF; idxCol++ ) # scan col numbers after the first, ignoring the first val
ColHds[ idxCol ] = $( idxCol + 1 ) # store the next col-val as this ColHdr
printf( fmtOutHdr, "RowNbr" ) # emit header line: RowNbr-header, innput column headers
print strOutDiv # emit divider row after header line and before data lines
next # skip to the next innput row
} # done with first innput row
{ # for each body innput row
RecNbr = ( ( NR - 1 ) % FldCnt ) # get RecNum for this row: ( RecNum - 1 ) Mod [number of fields]: zero-based / 0..[number_of_cols-1]
RecNbr = RecNbr ? RecNbr : FldCnt # promote from zero-based to one-based: 0 => [number of fields]: one -based / 1..[number_of_cols ]
for( idxCol = 0; idxCol <= FldCnt; idxCol++ ) # scan col numbers including the first
Rws[ ColHds[ idxCol ], RecNbr ] = $( idxCol + 1 ) # store this row+col val in this Row position under this ColHdr
} # done with this body innput row
RecNbr == FldCnt { # when we are on the last innput row that we are processing (lines beyond FldCnt are not emitted)
for( idxCol = 1; idxCol <= FldCnt; idxCol++ ) { # scan col numbers after the first
for( idxRow = 1; idxRow <= FldCnt; idxRow++ ) { # scan row numbers after the first, up to number of cols
printf( fmtOutDtl \
,Rws[ ColHds[ 0 ] , idxCol ] \
, ColHds[ idxRow ] \
,Rws[ ColHds[ idxRow ] , idxCol ] ) # emit innput rowHdr, colHdr, row+col val
} # done scanning row numbers
print "" # emit a blank line after each innput row
} # done scanning col numbers
} # done with the last innput row
END { # after processing innput lines
} # do nothing
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.