简体   繁体   English

如何使用 shell 脚本找到具有最大上下文长度的行号?

[英]How to find the line number with maximum context length with shell script?

I have a file with each line containing a string.我有一个文件,每一行都包含一个字符串。

I need to find the largest-N lines.我需要找到最大的 N 行。

I am able to find the lengths of the longest lines.我能够找到最长的那些行的长度。 For example, largest-1 could be found by:例如,largest-1 可以通过以下方式找到:

# Print the length of the single longest line in ./test.txt.
cat ./test.txt | awk '{ print length }' | sort -n | tail -1

Or largest-10 could be found by:或者可以通过以下方式找到 largest-10:

# Print the lengths of the 10 longest lines in ./test.txt, in ascending order.
cat ./test.txt | awk '{ print length }' | sort -n | tail -10

But I also need to get the line number for those lines side by side using shell script.但我还需要使用 shell 脚本并排获取这些行的行号。

Any help is appreciated.任何帮助表示赞赏。

Some lines from the input file:输入文件中的一些行:

testTweeVaderOverLedenInNLPeriodeGeboorte ( ) { java . util . List < nl . bzk . brp . model . objecttype . operationeel . BetrokkenheidModel > echtgenoten = java . util . Arrays . asList ( maakBetrokkenheden ( 20110701 , 20120101 , ( ( short ) ( 1 ) ) ) , maakBetrokkenheden ( 20120201 , 20120504 , ( ( short ) ( 1 ) ) ) ) ; org . mockito . Mockito . when ( relatieRepository . haalOpBetrokkenhedenVanPersoon ( org . mockito . Matchers . any ( nl . bzk . brp . model . objecttype . operationeel . PersoonModel . class ) , org . mockito . Matchers . any ( nl . bzk . brp . dataaccess . selectie . RelatieSelectieFilter . class ) ) ) . thenReturn ( echtgenoten ) ; org . mockito . Mockito . when ( persoonRepository . haalPersoonOpMetAdresViaBetrokkenheid ( echtgenoten . get ( 0 ) ) ) . thenReturn ( echtgenoten . get ( 0 ) . getBetrokkene ( ) ) ; org . mockito . Mockito . when ( persoonRepository . haalPersoonOpMetAdresViaBetrokkenheid ( echtgenoten . get ( 1 ) ) ) . thenReturn ( echtgenoten . get ( 1 ) . getBetrokkene ( ) ) ; java . util . List < nl . bzk . brp . model . objecttype . operationeel . PersoonModel > kandidaten = kandidaatVader . bepaalKandidatenVader ( new nl . bzk . brp . model . objecttype . operationeel . PersoonModel ( new nl . bzk . brp . model . objecttype . bericht . PersoonBericht ( ) ) , new nl . bzk . brp . model . attribuuttype . Datum ( 20120506 ) ) ; org . mockito . Mockito . verify ( persoonRepository , org . mockito . Mockito . times ( 2 ) ) . haalPersoonOpMetAdresViaBetrokkenheid ( ( ( nl . bzk . brp . model . objecttype . operationeel . BetrokkenheidModel ) ( org . mockito . Matchers . any ( ) ) ) ) ; "<AssertPlaceHolder>" ; } size ( ) { return elementen . size ( ) ; }
putListeners ( ) { final java . util . concurrent . atomic . AtomicInteger counter = new java . util . concurrent . atomic . AtomicInteger ( ) ; map . addListener ( new LRUMap . ModificationListener < java . lang . String , java . lang . Integer > ( ) { @ java . lang . Override public void onPut ( java . lang . String key , java . lang . Integer value ) { counter . incrementAndGet ( ) ; } @ java . lang . Override public void onRemove ( java . lang . String key , java . lang . Integer value ) { } } ) ; map . put ( "hello" , 1 ) ; map . put ( "hello2" , 2 ) ; "<AssertPlaceHolder>" ; } put ( java . lang . String , org . codehaus . httpcache4j . List ) { return super . put ( new org . codehaus . httpcache4j . util . CaseInsensitiveKey ( key ) , value ) ; }
testStatelessKieSession ( ) { org . kie . api . runtime . StatelessKieSession ksession = ( ( org . kie . api . runtime . StatelessKieSession ) ( org . kie . spring . tests . KieSpringComponentScanTest . context . getBean ( "ksession1" ) ) ) ; "<AssertPlaceHolder>" ; }
shouldHashSha1 ( ) { java . lang . String [ ] correctHashes = new java . lang . String [ ] { "da39a3ee5e6b4b0d3255bfef95601890afd80709" , "5baa61e4c9b93f3f0682250b6cf8331b7ee68fd8" , "285d0c707f9644b75e1a87a62f25d0efb56800f0" , "a42ef8e61e890af80461ca5dcded25cbfcf407a4" } ; java . util . List < java . lang . String > result = new java . util . ArrayList ( ) ; for ( java . lang . String password : fr . xephi . authme . security . HashUtilsTest . GIVEN_PASSWORDS ) { result . add ( fr . xephi . authme . security . HashUtils . sha1 ( password ) ) ; } "<AssertPlaceHolder>" ; } contains ( java . lang . String ) { return ( getObject ( path ) ) != null ; }
equalsOtherNullReturnsFalse ( ) { com . rackspacecloud . blueflood . types . BluefloodCounterRollup rollup = new com . rackspacecloud . blueflood . types . BluefloodCounterRollup ( ) ; "<AssertPlaceHolder>" ; } equals ( java . lang . Object ) { if ( ! ( obj instanceof com . rackspacecloud . blueflood . rollup . Granularity ) ) return false ; else return obj == ( this ) ; }
testFlatten ( ) { org . teiid . translator . document . Document doc = new org . teiid . translator . document . Document ( ) ; doc . addProperty ( "B" 4 , "AA" ) ; doc . addProperty ( "B" , "B" 2 ) ; org . teiid . translator . document . Document c1 = new org . teiid . translator . document . Document ( "c1" , false , doc ) ; c1 . addProperty ( "B" 1 , "11" ) ; org . teiid . translator . document . Document c2 = new org . teiid . translator . document . Document ( "c1" , false , doc ) ; c2 . addProperty ( "B" 3 , "B" 7 ) ; doc . addChildDocuments ( "c1" , java . util . Arrays . asList ( c1 , c2 ) ) ; org . teiid . translator . document . Document c4 = new org . teiid . translator . document . Document ( "c2" , false , doc ) ; c4 . addProperty ( "4" , "B" 0 ) ; org . teiid . translator . document . Document c5 = new org . teiid . translator . document . Document ( "c2" , false , doc ) ; c5 . addProperty ( "5" , "B" 6 ) ; doc . addChildDocuments ( "c2" , java . util . Arrays . asList ( c4 , c5 ) ) ; java . util . List < java . util . Map < java . lang . String , java . lang . Object > > result = doc . flatten ( ) ; java . util . List < java . util . Map < java . lang . String , java . lang . Object > > expected = java . util . Arrays . asList ( map ( "B" 4 , "AA" , "B" , "B" 2 , "c1/1" , "11" , "B" 5 , "B" 0 ) , map ( "B" 4 , "AA" , "B" , "B" 2 , "c1/2" , "B" 7 , "B" 5 , "B" 0 ) , map ( "B" 4 , "AA" , "B" , "B" 2 , "c1/1" , "11" , "c2/5" , "B" 6 ) , map ( "B" 4 , "AA" , "B" , "B" 2 , "c1/2" , "B" 7 , "c2/5" , "B" 6 ) ) ; "<AssertPlaceHolder>" ; } toArray ( ) { return java . util . Arrays . copyOf ( elementData , size ) ; }
testSetUnread ( ) { contact . setUnread ( 1 ) ; "<AssertPlaceHolder>" ; } getUnread ( ) { return unread ; }
testOracleDatabase ( ) { try { java . lang . String expectedSQL = ( org . pentaho . di . core . database . SelectCountIT . NonHiveSelect ) + ( org . pentaho . di . core . database . SelectCountIT . TableName ) ; org . pentaho . di . core . database . DatabaseMeta databaseMeta = new org . pentaho . di . core . database . DatabaseMeta ( org . pentaho . di . core . database . SelectCountIT . OracleDatabaseXML ) ; java . lang . String sql = databaseMeta . getDatabaseInterface ( ) . getSelectCountStatement ( org . pentaho . di . core . database . SelectCountIT . TableName ) ; "<AssertPlaceHolder>" ; } catch ( java . lang . Exception e ) { e . printStackTrace ( ) ; } } getSelectCountStatement ( java . lang . String ) { if ( ( databaseDialect ) != null ) { return databaseDialect . getSelectCountStatement ( tableName ) ; } return super . getSelectCountStatement ( tableName ) ; }

Expected output:预期输出:

linenumber,length
5,5000
10,3850
2,2000
...

You can use rquery ( https://github.com/fuyuncat/rquery ).您可以使用 rquery ( https://github.com/fuyuncat/rquery )。

[ rquery]$ ./rq -q "s @line+', '+strlen(@raw) | o strlen(@raw) desc" test.1 | head -10
73, 782
97, 760
72, 749
98, 749
63, 723
109, 711
108, 700
141, 626
5, 622
77, 619

Using GNU awk:使用 GNU awk:

$ gawk '
# Remember the length of every input line, keyed by its line number.
{ len[NR] = length($0) }

END {
    # GNU awk only: make for-in walk the array by numeric value, largest first.
    PROCINFO["sorted_in"] = "@val_num_desc"
    for (lineno in len) {
        print lineno, len[lineno]
    }
}' file

Output:输出:

1 1717
6 1649
8 883
2 762
4 656
5 375
3 268
7 107

If you don't have GNU awk but have some other awk and sort, pipe the output to:如果你没有 GNU awk,但有其他 awk 和 sort,请将输出通过管道传给:

$ awk ... | sort -t\  -k2nr

Using any version of these tools that exist on every Unix box:使用每个 Unix 盒子上存在的这些工具的任何版本:

$ awk -v OFS=',' '{print NR, length($0)}' file | sort -t, -rnk2 | head -n 5
1,1717
6,1649
8,883
2,762
4,656

Just add echo 'linenumber,length'; at the start to get a header line if you really want it.如果你确实需要标题行,只需在开头添加 echo 'linenumber,length'; 即可。

The above is outputting 5 lines instead of 10 to demonstrate selection of largest-N since OP only provided 8 lines of sample input.上面输出 5 行而不是 10 行来演示最大 N 的选择,因为 OP 只提供了 8 行样本输入。

You can try below.你可以试试下面。

# Print a CSV header, then the 10 longest lines of input.txt as "linenumber,length",
# longest first. The header is emitted outside the sort so it always stays on top,
# and sort is told that the separator is a comma so that key 2 really is the length.
{ echo 'linenumber,length'; awk -v OFS=',' '{ print NR, length($0) }' input.txt | sort -t, -k2,2nr | head -n 10; }

UPDATE 2: BENCHMARKING更新 2:基准测试

After adding a Top-10-only filter to another solution above, here's how it fares for the same input set - 11.896 secs在上面的另一个解决方案中添加仅保留前 10 名的过滤器后,它在相同输入集上的表现如下 - 11.896 秒

# Benchmark: the earlier gawk answer, limited to the top 10 lines.
# NOTE(review): fgc, pvE0, pvE9, gcat, lgp3 and ${m3t} are not defined in this
# snippet - presumably helpers of the author; confirm before reuse.
fgc; ( time ( pvE0 < "${m3t}" | 

gawk '{
    # store the length of every input line, keyed by line number
    a[NR]=length
}
END {
    PROCINFO["sorted_in"]="@val_num_desc"  # this line is GNU awk only
    # for-in now visits the array by numeric value, largest first;
    # the counter _ stops the loop after 10 lines have been printed
    for(i in a) { 
        print i,a[i]; if (9 < ++_) { break } } }' ) | 

pvE9 ) | gcat -b | lgp3 5

      in0: 1.85GiB 0:00:08 [ 228MiB/s] [ 228MiB/s] [=>] 100%            
     out9: 0.00 B 0:00:11 [0.00 B/s] [0.00 B/s] [<=>]

     1  6954837 18458
     2  11417380 14247
     3  6654331 11188
     4  7576850 10352
     5  12262953 10182

     6  12279191 10156
     7  12329231 9679
     8  11479085 9568
     9  12329230 9400
    10  12418983 8666

     out9:  143 B 0:00:11 [12.0 B/s] [12.0 B/s] [<=> ]
( pvE 0.1 in0 < "${m3t}" | gawk ; )  

11.49s user 0.77s system 103% cpu 11.896 total

UPDATE 1:更新 1:

If you're willing to take a leap of faith and assume the input is fully valid UTF-8 to begin with, then by adding this small function, it can report line #, byte-count, and also UTF-8 chars count:如果您愿意大胆假设输入一开始就是完全有效的 UTF-8,那么通过添加这个小函数,它可以报告行号、字节数以及 UTF-8 字符数:

 # Count the UTF-8 characters in the current record ($0).
 # Only applicable for non-unicode aware awks, i.e. those whose length() counts bytes.
 # Assumes the input is valid UTF-8; the parameter is used purely as a local variable.
 function _______(_) {
     _ = $(_<_)    # _<_ is always 0, so this copies $0 into the local
     # Delete every byte that cannot start a character: continuation bytes
     # (\200-\277) plus the bytes that never occur in valid UTF-8 (\300, \301, \365-\377).
     gsub("[\\\200-\\\301\\\365-\\\377]+", "", _)
     return length(_)    # one remaining byte per character
 }
  in0: 1.85GiB 0:00:01 [1.63GiB/s] [1.63GiB/s] [========>] 100% 
        
 ( pvE 0.1 in0 < "${m3t}" | mawk2 ; )  

  0.93s user 0.42s system 117% cpu 1.152 total


 1  index ::  2   #-bytes ::  16024  |  Ln::  12417761  |  #-utf8-chars :: 8663
 2  index ::  3   #-bytes ::  16033  |  Ln::  12418983  |  #-utf8-chars :: 8666
 3  index ::  4   #-bytes ::  22261  |  Ln::  11417380  |  #-utf8-chars :: 14247
 4  index ::  5   #-bytes ::  20574  |  Ln::  6654331   |  #-utf8-chars :: 11188

 5  index ::  6   #-bytes ::  20714  |  Ln::  12329231  |  #-utf8-chars :: 9679
 6  index ::  7   #-bytes ::  20077  |  Ln::  12329230  |  #-utf8-chars :: 9400
 7  index ::  8   #-bytes ::  18870  |  Ln::  3781376   |  #-utf8-chars :: 8416
 8  index ::  9   #-bytes ::  16801  |  Ln::  9000781   |  #-utf8-chars :: 8459

 9  index ::  0   #-bytes ::  25891  |  Ln :: 6954837   |  #-utf8-chars :: 18458
10  index ::  1   #-bytes ::  16051  |  Ln :: 11479085  |  #-utf8-chars :: 9568

======================== =========================

It's much faster doing it in awk - not even 1.2 secs to finish a 1.85 GB text file filled with utf-8 :在 awk 中执行要快得多 - 处理一个充满 utf-8 内容的 1.85 GB 文本文件甚至用不到 1.2 秒:

  • instead of storing every single line, it only updates entries in the 10-item array whenever the shortest existing entry will get knocked down to 11-th spot它不存储每一行,而是仅当现有最短条目会被挤到第 11 位时才更新 10 项数组中的条目

  • since ties favor existing entries, the array is seldom updated at all由于平局时保留现有条目,因此该数组很少更新

  • it also temp stores the shortest entry into a variable, which is a lot less overhead than reading from and writing to the array for every one of the 12 million rows in the test file它还将最短的条目临时存储到变量中,这比在测试文件中的 1200 万行中的每一行中读取和写入数组的开销要少得多

| |

# Benchmark of the single-pass top-10 scan: one 10-slot array is maintained
# instead of storing the length of every line.
# NOTE(review): fgc, pvE0 and ${m3t} are not defined in this snippet - presumably
# shell helpers/variables of the author; confirm before reuse.
fgc; ( time ( pvE0 < "${m3t}" | 

mawk2 '
# ______(arr): returns the index of the entry of arr whose numeric value is smallest.
# Only the first argument is meant to be passed; the remaining parameters serve as locals.
function ______(___,_,__,____,_____) { 
               # running minimum starts at 3^3^3 (= 3^27), used as a very large sentinel
               __=(_=3)^_^_
         _____=""
    for(____ in ___) {
         # if this entry is numerically below the running minimum, it becomes the
         # new minimum and its index is remembered; otherwise both are left alone.
         # NOTE(review): relies on the left-hand __ being read before the embedded
         # assignment to __ is performed - confirm for the awk in use.
         _____=__==(__=+(_=___[____])<+__\
               ?_:__) ?_____:____ 
    } return _____ 
 } 
 BEGIN { 
    # __ = 10 is the number of slots; sprintf yields a string of nine "0" characters
    # (width 10 - 1), which split() with an empty separator is expected to break
    # into ___[1..9] - NOTE(review): per-character split on an empty separator is
    # implementation behaviour, confirm for the awk in use.
    split(sprintf("%0*.f",
         (__=10)-!_,_),___,_)

    # FS is set to "^$" (presumably to avoid useless field splitting); that string is
    # then used numerically as 0, so _ (index of the current minimum), ___[0] (the
    # tenth slot) and _____ (current minimum length) all start at 0.
    _____=___[+_] = _*= \
               FS =  "^$" 
 } _____<(____=length($!__)) { 
   
    # !__ is 0, so ____ is length($0). Reached only when the line is strictly longer
    # than the current minimum: overwrite that slot with "<length>_|_LineNum_::_<NR>",
    # then find the new minimum slot (_) and cache its numeric length (_____).
    ___[_]=____ "_|_LineNum_::_"NR
          _____=+___[_=______(___)] 
 } END { 
      # print all __ (10) slots, starting at the slot holding the minimum and wrapping around
      for(____=__+_;_<____;_++) { 
          print "index :: ",_%__,"_length :: ",___[_%__] } } ' )) 

  sleep 1

  # Reference pipeline for comparison: print "length=linenumber" for every line (OFS is "="),
  # sort by length and then by line number, both numerically descending, keep the first
  # 10 rows, re-sort those ascending by length, then number and tabulate them.
  # NOTE(review): gsort/gsed/gcat are presumably the GNU tools under a g- prefix, and
  # pvE0/${m3t} helpers of the author - confirm before reuse.
  ( time ( pvE0 < "${m3t}" | 
           mawk2 '{ print length($0),NR }' OFS== ) |
  LC_ALL=C gsort -t= -k 1,1nr -k 2,2nr ) |
  gsed -n '1,10p;10q'                    | 
  gsort -t= -k 1,1n | gcat -b | rs -t -c$'\n' -C= 0 3 | column -s= -t
 
 in0: 1.85GiB 0:00:01 [1.64GiB/s] [1.64GiB/s] [============>] 100%            
 
 0.93s user 0.42s system 117% cpu 1.145 total

 1  index ::  2   length ::  16024  |  LineNum  ::  12417761
 2  index ::  3   length ::  16033  |  LineNum  ::  12418983
 3  index ::  4   length ::  22261  |  LineNum  ::  11417380
 4  index ::  5   length ::  20574  |  LineNum  ::  6654331

 5  index ::  6   length ::  20714  |  LineNum  ::  12329231
 6  index ::  7   length ::  20077  |  LineNum  ::  12329230
 7  index ::  8   length ::  18870  |  LineNum  ::  3781376
 8  index ::  9   length ::  16801  |  LineNum  ::  9000781

 9  index ::  0   length ::  25891  |  LineNum  ::  6954837
10  index ::  1   length ::  16051  |  LineNum  ::  11479085


  in0: 1.85GiB 0:00:07 [ 247MiB/s] [ 247MiB/s] [=========>] 100%            

 1  16024  12417761       5 18870  3781376        9 22261  11417380
 2  16033  12418983       6 20077  12329230      10 25891  6954837
 3  16051  11479085       7 20574  6654331
 4  16801  9000781        8 20714  12329231


2.63s user 0.42s system 39% cpu 7.681 total
5.85s user 0.51s system 72% cpu 8.808 total

声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.

 
粤ICP备18138465号  © 2020-2024 STACKOOM.COM