简体   繁体   中英

How to rearrange the columns using awk?

I have a file with 120 columns. A part of it is here with 12 columns.

A1      B1     C1      D1       A2      B2     C2      D2       A3      B3      C3      D3     
4       4       5       2       3       3       2       1       9       17      25      33
5       6       4       6       8       2       3       5       3       1       -1      -3
7       8       3       10      13      1       4       9       -3      -15     -27     -39
9       10      2       14      18      0       5       13      -9      -31     -53     -75
11      12      1       18      23      -1      6       17      -15     -47     -79     -111
13      14      0       22      28      -2      7       21      -21     -63     -105    -147
15      16      -1      26      33      -3      8       25      -27     -79     -131    -183
17      18      -2      30      38      -4      9       29      -33     -95     -157    -219
19      20      -3      34      43      -5      10      33      -39     -111    -183    -255
21      22      -4      38      48      -6      11      37      -45     -127    -209    -291

I would like to rearrange it by bringing all A columns together (A1 A2 A3 A4) and similarly all Bs (B1 B2 B3 B4), Cs (C1 C2 C3 C4), Ds (D1 D2 D3 D4) together.

I am looking to print the columns as

A1 A2 A3 A4 B1 B2 B3 B4 C1 C2 C3 C4 D1 D2 D3 D4
 

My script is:

#!/bin/sh
# Attempt to regroup the columns of input.txt into output.txt.
# NOTE(review): {1..4} is brace expansion (bash/ksh/zsh), not POSIX sh, so the
# result under #!/bin/sh depends on which shell /bin/sh actually is - confirm.
# Remove the first (header) line of input.txt in place.
sed -i '1d' input.txt
# First pass: j = 1 + 3*(i-1), i.e. fields 1, 4, 7, 10.
# NOTE(review): the sample header repeats every 4 columns (A B C D), so a
# stride of 3 and only three passes look unintended - confirm.
for i in {1..4};do
    j=$(( 1 + $(( 3 * $((  i - 1 )) ))  ))
# Each awk run prints field j of every line, one value per output line, and
# appends that to output.txt. Successive runs therefore stack below each
# other, which is why everything ends up in a single column.
awk '{print $'$j'}' input.txt >> output.txt
done
# Second pass: j = 2 + 3*(i-1), i.e. fields 2, 5, 8, 11.
for i in {1..4};do
    j=$(( 2 + $(( 3 * $((  i - 1 )) ))  ))
awk '{print $'$j'}' input.txt >> output.txt
done
# Third pass: j = 3 + 3*(i-1), i.e. fields 3, 6, 9, 12.
for i in {1..4};do
    j=$(( 3 + $(( 3 * $((  i - 1 )) ))  ))
awk '{print $'$j'}' input.txt >> output.txt
done

It is printing all in one column.

Is it just A,B,C,D,A,B,C,D all the way across? Something like this should work:

awk '
# Regroup columns that repeat in a fixed cycle of 4 (A B C D A B C D ...):
# print every A column first, then every B column, and so on.
# Reads standard input; output fields are joined with OFS.
{
    sep = ""                           # nothing goes before the first field
    for (i=0; i<4; ++i) {              # i=0:A, i=1:B, etc.
        for (j=0; 4*j+i<NF; ++j) {     # j-th repetition of letter i
            printf "%s%s", sep, $(4*j+i+1)
            sep = OFS                  # separator only between fields, so the
                                       # line no longer ends with a stray OFS
        }
    }
    print ""                           # terminate the output line
}'

Here are two generic solutions that do not hard-code the field numbers from Input_file: the columns can come in any order and will be sorted automatically. Written and tested in GNU awk with the shown samples.

1st solution: Traverse through all the lines and their respective fields and then sort by values to perform indexing on headers.

awk '
# Reorder the columns of a whitespace-separated table so that the header
# names come out in sorted order (A1 A2 A3 B1 ...).
# Requires GNU awk for PROCINFO["sorted_in"]. The whole input is buffered in
# memory and printed from the END block.

# Header line: remember the name of every column by its input position.
FNR==1{
  for(i=1;i<=NF;i++){
     arrInd[i]=$i
  }
  next
}
# Data lines: store each cell under (line number, column name).
{
  for(i=1;i<=NF;i++){
     value[FNR,arrInd[i]]=$i
  }
}
END{
  # Traverse arrInd by value. Header names are not numeric, so they all tie
  # as numbers and gawk breaks the tie by comparing them as strings
  # (which also means A10 sorts before A2).
  PROCINFO["sorted_in"]="@val_num_asc"
  # Capture the sorted names once. The loop variable of "for (i in arrInd)"
  # is the original column position, not the rank in sorted order, so it must
  # not be used to decide where the output line ends.
  numCols=0
  for(i in arrInd){
     sortedName[++numCols]=arrInd[i]
  }
  # Header row.
  for(k=1;k<=numCols;k++){
     printf("%s%s",sortedName[k],k==numCols?ORS:OFS)
  }
  # Data rows; input line 1 was the header, so start at 2.
  for(i=2;i<=FNR;i++){
     for(k=1;k<=numCols;k++){
        printf("%s%s",value[i,sortedName[k]],k==numCols?ORS:OFS)
     }
  }
}
'   Input_file

Or, in case you want the output in tabular format, apply a small tweak to the above solution:

awk '
# Same column reordering as a sorted-header pass, but emitting tab-separated
# output so that column(1) can align it into a table.
# Requires GNU awk for PROCINFO["sorted_in"]. The whole input is buffered in
# memory and printed from the END block.
BEGIN { OFS="\t" }
# Header line: remember the name of every column by its input position.
FNR==1{
  for(i=1;i<=NF;i++){
    arrInd[i]=$i
  }
  next
}
# Data lines: store each cell under (line number, column name).
{
  for(i=1;i<=NF;i++){
    value[FNR,arrInd[i]]=$i
  }
}
END{
  # Traverse arrInd by value. Header names are not numeric, so they all tie
  # as numbers and gawk breaks the tie by comparing them as strings
  # (which also means A10 sorts before A2).
  PROCINFO["sorted_in"]="@val_num_asc"
  # Capture the sorted names once. The loop variable of "for (i in arrInd)"
  # is the original column position, not the rank in sorted order, so it must
  # not be used to decide where the output line ends.
  numCols=0
  for(i in arrInd){
    sortedName[++numCols]=arrInd[i]
  }
  # Header row.
  for(k=1;k<=numCols;k++){
    printf("%s%s",sortedName[k],k==numCols?ORS:OFS)
  }
  # Data rows; input line 1 was the header, so start at 2.
  for(i=2;i<=FNR;i++){
    for(k=1;k<=numCols;k++){
       printf("%s%s",value[i,sortedName[k]],k==numCols?ORS:OFS)
    }
  }
}
' Input_file | column -t -s $'\t'


2nd solution: Almost the same concept as the 1st solution; here the array is traversed within conditions inside a single loop, rather than in separate header and data loops in the END block of the program.

awk '
# Reorder the columns of a whitespace-separated table so that the header
# names come out in sorted order, using one loop over the line numbers in END
# that prints the header for line 1 and the stored cells for every other line.
# Requires GNU awk for PROCINFO["sorted_in"]. The whole input is buffered in
# memory; output is tab-separated and aligned by column(1).
BEGIN { OFS="\t" }
# Header line: remember the name of every column by its input position.
FNR==1{
  for(i=1;i<=NF;i++){
    arrInd[i]=$i
  }
  next
}
# Data lines: store each cell under (line number, column name).
{
  for(i=1;i<=NF;i++){
    value[FNR,arrInd[i]]=$i
  }
}
END{
  # Traverse arrInd by value. Header names are not numeric, so they all tie
  # as numbers and gawk breaks the tie by comparing them as strings
  # (which also means A10 sorts before A2).
  PROCINFO["sorted_in"]="@val_num_asc"
  # Capture the sorted names once. The loop variable of "for (k in arrInd)"
  # is the original column position, not the rank in sorted order, so it must
  # not be used to decide where the output line ends.
  numCols=0
  for(k in arrInd){
    sortedName[++numCols]=arrInd[k]
  }
  for(i=1;i<=FNR;i++){
    if(i==1){
       # Line 1 is the header: print the sorted names themselves.
       for(k=1;k<=numCols;k++){
          printf("%s%s",sortedName[k],k==numCols?ORS:OFS)
       }
    }
    else{
       # Any other line: print its cells in sorted-name order.
       for(k=1;k<=numCols;k++){
          printf("%s%s",value[i,sortedName[k]],k==numCols?ORS:OFS)
       }
    }
  }
}
' Input_file | column -t -s $'\t'

A similar approach to @MarkReed that manipulates the increment instead of the test condition can be written as:

awk '
# For each of the 4 starting offsets, walk the line in steps of 4 so that all
# columns sharing an offset are printed next to each other.
{
  for (first = 1; first <= 4; first++) {
    col = first
    while (col <= NF) {
      # A tab precedes every field except field 1, which always comes first.
      lead = (col > 1) ? "\t" : ""
      printf "%s%s", lead, $col
      col += 4
    }
  }
  # Runs once per input line, after both loops: ends the output line.
  print ""
}
' cols.txt

Example Use/Output

With your sample input in cols.txt you would have:

$ awk '{
>   for (n=1; n<=4; n++)
>     for (c=n; c<=NF; c+=4)
>       printf "%s%s", ((c>1)?"\t":""), $c
>     print ""
>   }
> ' cols.txt
A1      A2      A3      B1      B2      B3      C1      C2      C3      D1      D2      D3
4       3       9       4       3       17      5       2       25      2       1       33
5       8       3       6       2       1       4       3       -1      6       5       -3
7       13      -3      8       1       -15     3       4       -27     10      9       -39
9       18      -9      10      0       -31     2       5       -53     14      13      -75
11      23      -15     12      -1      -47     1       6       -79     18      17      -111
13      28      -21     14      -2      -63     0       7       -105    22      21      -147
15      33      -27     16      -3      -79     -1      8       -131    26      25      -183
17      38      -33     18      -4      -95     -2      9       -157    30      29      -219
19      43      -39     20      -5      -111    -3      10      -183    34      33      -255
21      48      -45     22      -6      -127    -4      11      -209    38      37      -291

Here's a succinct generic solution that is not memory-bound, as RavinderSingh13's solution is. (That is, it does not store the entire input in an array for printing in END.)


# Print the input with its columns reordered so that the header names are in
# sorted order. Only the current line is held in memory.
# Requires GNU awk (asort). Header names must be unique, because each name is
# used as an array index.
BEGIN {
   OFS = "\t"   # output field separator
}

# Header line: sort the names and record which input field feeds each
# output position, then print the sorted header.
NR == 1 {
   for (col = 1; col <= NF; col++) {
      names[col] = $col
      where[$col] = col
   }
   asort(names)
   for (col = 1; col <= NF; col++) {
      source[col] = where[names[col]]
      $col = names[col]
   }
   print
   next
}

# Data lines: snapshot the fields first, because assigning to a field
# overwrites it, then fill each position from its recorded input field.
{
   split($0, cells)
   for (col = 1; col <= NF; col++) {
      $col = cells[source[col]]
   }
   print
}

The idea here is that at the first line of input, we record the position of our columns in the input, then sort the list of column names with asort() . It is important here that column names are not duplicated, as they are used as the index of an array.

As we step through the data, each line is reordered by replacing each field with the value from the position as sorted by the first line.

It is important that you set your input field separator correctly (whitespace, tab, comma, whatever), and have the complete set of fields in each line, or output will be garbled.

Also, this doesn't create columns. You mentioned A4 in your question, but there is no A4 in your sample data. We are only sorting what is there.

Lastly, this is a GNU awk program, due to the use of asort() .

Using any awk for any number of tags (non-numeric leading strings in the header line) and/or numbers associated with them in the header line, including different counts of each letter so you could have A1 A2 but then B1 B2 B3 B4, reproducing the input order in the output and only storing 1 line at a time in memory:

$ cat tst.awk
# Group columns by the leading non-numeric part ("tag") of their header name.
# Tags keep the order in which they first appear in the header, and the
# columns of one tag keep their input order. Works line by line.
BEGIN { OFS = "\t" }

# Header: work out once which input field feeds each output position.
NR == 1 {
    for ( col = 1; col <= NF; col++ ) {
        prefix = $col
        sub(/[0-9]+$/, "", prefix)            # strip the trailing digits: A12 -> A
        if ( ++count[prefix] == 1 ) {         # first column carrying this tag
            tagOrder[++tagCount] = prefix
        }
        colOf[prefix, count[prefix]] = col
    }
    # Flatten (tag, n-th column of tag) into a single list of field numbers.
    for ( t = 1; t <= tagCount; t++ ) {
        prefix = tagOrder[t]
        for ( k = 1; k <= count[prefix]; k++ ) {
            source[++outCols] = colOf[prefix, k]
        }
    }
}

# Every line, header included (the rule above has no "next"): emit the fields
# in the precomputed order.
{
    row = ""
    for ( c = 1; c <= outCols; c++ ) {
        if ( row != "" ) {                    # separator only once row has content
            row = row OFS
        }
        row = row $(source[c])
    }
    print row
}

$ awk -f tst.awk file
A1      A2      A3      B1      B2      B3      C1      C2      C3      D1      D2      D3
4       3       9       4       3       17      5       2       25      2       1       33
5       8       3       6       2       1       4       3       -1      6       5       -3
7       13      -3      8       1       -15     3       4       -27     10      9       -39
9       18      -9      10      0       -31     2       5       -53     14      13      -75
11      23      -15     12      -1      -47     1       6       -79     18      17      -111
13      28      -21     14      -2      -63     0       7       -105    22      21      -147
15      33      -27     16      -3      -79     -1      8       -131    26      25      -183
17      38      -33     18      -4      -95     -2      9       -157    30      29      -219
19      43      -39     20      -5      -111    -3      10      -183    34      33      -255
21      48      -45     22      -6      -127    -4      11      -209    38      37      -291

or with different formats of tags and different numbers of columns per tag:

$ cat file
foo1    bar1    bar2    bar3    foo2    bar4
4       4       5       2       3       3
5       6       4       6       8       2

$ awk -f tst.awk file
foo1    foo2    bar1    bar2    bar3    bar4
4       3       4       5       2       3
5       8       6       4       6       2

The above assumes you want the output order per tag to match the input order, rather than being based on the numeric values after each tag; so if you have input of A2 B1 A1 then the output will be A2 A1 B1, not A1 A2 B1.

The technical post webpages of this site follow the CC BY-SA 4.0 protocol. If you need to reprint, please indicate the site URL or the original address.Any question please contact:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM