widged.com
 

Examples of Awk mini programs

Basic

stop at R

This will print out all lines until it reaches one containing a capital R.

# Copy input to output, stopping (without printing) at the first line that contains "R".
$0 ~ /R/ { exit }
{ print }

print non duplicated lines (drops consecutive repeats only)

# Print a line only when it differs from the line just before it (like uniq).
# NR == 1 keeps the first line even if it is empty (it would otherwise equal the unset prev).
# Appending "" forces a string comparison; otherwise numeric-looking lines such as
# "1" and "1.0" would compare equal as numbers and the second would be dropped.
NR == 1 || ($0 "") != (prev "") { print; prev = $0 }

print odd lines only

# Print the 1st, 3rd, 5th, ... input lines.
# ln is a hand-kept record counter; even-numbered records are skipped with next.
BEGIN { ln = 0 }
(++ln % 2) == 0 { next }
{ print }

(Note that % (percent sign) means modulo: ln % 2 gives the remainder of the division of ln by 2.)

print the words from last to first

# For each record, print its fields from the last to the first, one field per output line.
{
  i = NF
  while (i > 0) {
    print $i
    i--
  }
}

Print out strings that are of a wiki word format

# Statement fragment: place it inside an action, with str set by the surrounding code.
# Prints the leftmost substring of str shaped like a wiki word: a capital, one or more
# lowercase letters, another capital, then any further letters.
if (match(str, /[A-Z][a-z]+[A-Z][A-Za-z]*/) != 0) {
  # match() leaves the position and length of the hit in RSTART and RLENGTH.
  print substr(str, RSTART, RLENGTH)
}

Print lines that are all capital letters

# Print lines made up of one or more characters in A-Z and nothing else.
/^[A-Z]+$/ { print }

skip blank lines

# Skip empty lines so that the rules that follow never see them.
length($0) == 0 { next }

print the result into a file

# Statement fragment: write the current record to output.txt instead of standard output.
print > "output.txt"

Advanced

Randomly select a word

# Write a random sample of about one input line in 18 to the file "<input name> sa".
BEGIN {
  FS = "\t"; OFS = "\t"; fileIn = ""
  # Seed the generator; without this rand() yields the same sequence on every run,
  # so the "random" sample would always be the same lines.
  srand()
}
{
  # The output name is derived once, from the first input file seen.
  if (fileIn == "") {
    fileIn = FILENAME
    fileOut = fileIn " sa"
  }

  # int(rand() * 18) is uniform over 0..17, so this keeps exactly 1 line in 18 on average.
  if (int(rand() * 18) == 0) {
    print $0 > fileOut
  }
}

Select the first 1000 words in the database

# Copy the first 1000 records of the input to the file "<input name> (1000)", then stop.
BEGIN {
  OFS = "\\"
  FS = OFS
  fileIn = ""
}

# Derive the output name once, from the first input file seen.
fileIn == "" {
  fileIn = FILENAME
  fileOut = fileIn " (1000)"
}

# nbI counts the records read so far; quit as soon as the quota is exceeded.
++nbI > 1000 { exit }

{ print > fileOut }

Read a table of lexical statistics, for instance, here read a table with bigram statistics

# Load the bigram statistics table named by the global variable bgTable.
#
# Each table line is split with the current FS into: bigram, then three
# (type, token) column pairs, stored under the suffixes ".i", ".m" and ".f".
# Fills the global arrays:
#   frBgType[bg], frBgToken[bg]          sums of the three type / token columns
#   frBgPType[bg suffix], frBgPToken[bg suffix]   the individual columns, forced numeric
#
# The line is read into a local variable so the caller's $0 and fields are left
# untouched. If the table cannot be read, the message goes to standard error
# and the program exits with status 1. The file is closed once fully read.
# (line and col are locals; callers still invoke readTable() with no arguments.)
function readTable(    line, col) {

  # getline returns 1 per line read, 0 at end of file, -1 on error.
  while ((erF = (getline line < bgTable)) > 0) {
    split(line, col)   # same splitting rules as the fields; missing columns count as 0
    bg = col[1]

    frBgType[bg] = col[2] + col[4] + col[6]
    frBgToken[bg] = col[3] + col[5] + col[7]
    frBgPType[bg ".i"] = col[2] + 0
    frBgPToken[bg ".i"] = col[3] + 0
    frBgPType[bg ".m"] = col[4] + 0
    frBgPToken[bg ".m"] = col[5] + 0
    frBgPType[bg ".f"] = col[6] + 0
    frBgPToken[bg ".f"] = col[7] + 0
  }

  if (erF < 0) {
    print "Fichier '" bgTable "' introuvable!" > "/dev/stderr"
    exit 1
  }

  close(bgTable)
}

Merge the various database files from Celex

# Merge four parallel Celex files (DOW, DPW, DFW, DMW) line by line into one
# tab-separated file, "Celex Dutch VG". Records whose id (field 1) differs
# between DOW and any of the other files are also logged to "mismatch".
BEGIN {

  FS = "\\"; OFS = "\t"
  test = 0              # set to 1 to stop once more than 100 DOW records have been read
  fileDOW = "DOW_mac"
}

# Read the next record of file into $0. When nothing can be read (end of file,
# or the file is missing) $0 is emptied, so the id check in the caller reports
# a mismatch instead of silently reusing the previously read record.
function nextRecord(file) {
  if ((getline < file) <= 0) { $0 = "" }
}

# NOTE(review): the merge runs from a main rule, so awk needs at least one input
# record to trigger it. The files are deliberately left open after the loop:
# on later input records the loop then finds fileDOW at end of file and does nothing.
{
   # "> 0" matters: getline returns -1 for an unreadable file, which a bare
   # truth test would treat as success and loop on forever.
   while ((getline < fileDOW) > 0) {
    l++
    if (test && l > 100) { close(fileDOW); exit }

    ligne = ""
    mis = ""
    idNumb = $1
    ligne = ligne $1 OFS $2 OFS $4 OFS $5 OFS $6 OFS $7 OFS $9

    nextRecord("DPW_mac")
    ligne = ligne OFS $5 OFS $6 OFS $7
    if (idNumb != $1) { mis = "DPW " $1 " "}

    nextRecord("DFW_mac")
    ligne = ligne OFS $4 OFS $6 OFS $7
    if (idNumb != $1) { mis = mis "DFW" $1 " "}

    nextRecord("DMW_mac")
    ligne = ligne OFS $5
    if (idNumb != $1) { mis = mis "DMW" $1 " "}

    print ligne > "Celex Dutch VG"
    if (length(mis) > 2) {
      print mis OFS ligne > "mismatch"
      nbmis++
    }
    nbI++

    # Progress indicator: one line on standard output per 1000 merged records.
    if (nbI % 1000 == 0) { print nbI }

    # Give up once the files are clearly out of step.
    if (nbmis > 50) { exit }
  }

}

Add zeros at the start of the id numbers, such that the length is homogeneous

Note: to pad with spaces rather than zeros, consider using sprintf("%10d", number).

# Prefix every record with a fixed-width key built from field 1: "@" followed by
# the id left-padded with zeros to 6 characters. Output goes to "<input name> @@".
BEGIN {
  FS = "\t"; OFS = "\t"

  fileIn = ""
  # zero[n] is "@" followed by n zeros. zero[0] must exist: without it an id that
  # is already 6 characters long would be written without the "@" prefix.
  zero[0] = "@"
  for (i = 1; i < 10; i++)
    zero[i] = zero[i-1] "0"
}
{
  # The output name is derived once, from the first input file seen.
  if (fileIn == "") {
    fileIn = FILENAME
    fileOut = fileIn " @@"
  }
  toMark = $1
  ligne = $0

  # Ids longer than 6 characters need no padding but still get the "@" prefix.
  pad = 6 - length(toMark)
  if (pad < 0) pad = 0

  id = zero[pad] toMark
  print id, ligne > fileOut
}
 
en/computers/lg/awk/snippets.txt · Last modified: 2009/12/14 13:06 by marielle
 
RSS - Banner by widged, template © 7throot HeadQuarters, 2007