9.7. Examples: Reading specific columns from a file or a list of files

Print the first column of each file In order to print the first column of these files, we will use awk utility. As it was shown before, the command for this utility consists on some optional flags followed by an action statement and then the list of files. In this case the action statement is '{print $1}', because we want to print only the first column ($1). Files file2.txt and file4.txt use a space as a column separator (which is the separator for default). So, to access the first column of these files we don't need the -F flag. However, files file1.csv and file3.csv use a comma as a separator. So, in order for awk to distinguish the different columns we have to use the -F flag. In this case, the parameter of the -F flag is a comma (-F','). If you are trying to read a file which has the columns separated by a different character, then use that character instead of ','.

Space-separated files Comma-separated files
$ awk '{print $1}' file2.txt $ awk -F',' '{print $1}' file1.csv
"AnonymizedID" "Anonymized ID"
"B11130912" "B33199522"
"B11137244" "B33199603"
"B11154534" "B11137879"
"B11144100" "B11144410"
"B11137244" "B11110455"
"B12226566" "B11135291"
"B11134987" "B11153927"
"B11144345" "B11177579"
"C11137159" "B11177806"
"B11156453" "B11157958"
"B11110676" "B11110690"
"C11138929" "B11152799"
"B11154532" "B11154358"
"B11155267" "B11110925"
"B11137120" "B11135291"
"B33191224" "B11135072"
"B11155267" "B33199603"
"C11138999" "B11137879"
"B11131605" "B11131605"
"B11137784" "B11110927"
"B11156098" "B11147712"
"B11133232" "B33191224"
"B11135292" "B11131290"
"C11138912" "B11157974"
"B11150911" "B33191224"
"B11152577" "B11141503"
"B11156098" "C11137159"
"B11133232" "B33199522"

$ awk '{print $1}' file4.txt $ awk -F',' '{print $1}' file3.csv
AnonymizedID Anonymized ID
B11108326 C11138122
B11110893 C11138192
B11119909 B12226507
D11144030 B12226546
D11144030 C11138122
B11119903 C11138184
C11131039 C11138797
C11133100 C11138152
C11135566 C11138150
C11137159 C11137167
C11137159 C11137159
C11137167 C11137167
C11137167 C11137159
C11137439 C11131039
C11137439 C11135566
C11137443 B11119903
C11137544 C11137544
C11137123 C11137443
C11138150 C11137123
C11138152 C11137439
C11138797 C11137439
C11138184 C11133100
C11138122 D11144030
C11138122 B11108399
C11138192 B11108326
B12226507 B11119909
B12226546 B11110893

To precede each line by the line number, add NR, after the string print in the awk command to indicate that you want to print the Number Row before the column 1 ($1):

Space-separated files Comma-separated files
$ awk '{print NR,$1}' file2.txt $ awk -F',' '{print NR, $1}' file1.csv
1 "AnonymizedID" 1 "Anonymized ID"
2 "B11130912" 2 "B33199522"
3 "B11137244" 3 "B33199603"
4 "B11154534" 4 "B11137879"
5 "B11144100" 5 "B11144410"
6 "B11137244" 6 "B11110455"
7 "B12226566" 7 "B11135291"
8 "B11134987" 8 "B11153927"
9 "B11144345" 9 "B11177579"
10 "C11137159" 10 "B11177806"
11 "B11156453" 11 "B11157958"
12 "B11110676" 12 "B11110690"
13 "C11138929" 13 "B11152799"
14 "B11154532" 14 "B11154358"
15 "B11155267" 15 "B11110925"
16 "B11137120" 16 "B11135291"
17 "B33191224" 17 "B11135072"
18 "B11155267" 18 "B33199603"
19 "C11138999" 19 "B11137879"
20 "B11131605" 20 "B11131605"
21 "B11137784" 21 "B11110927"
22 "B11156098" 22 "B11147712"
23 "B11133232" 23 "B33191224"
24 "B11135292" 24 "B11131290"
25 "C11138912" 25 "B11157974"
26 "B11150911" 26 "B33191224"
27 "B11152577" 27 "B11141503"
28 "B11156098" 28 "C11137159"
29 "B11133232" 29 "B33199522"

$ awk '{print NR, $1}' file4.txt $ awk -F',' '{print NR, $1}' file3.csv
1 AnonymizedID 1 Anonymized ID
2 B11108326 2 C11138122
3 B11110893 3 C11138192
4 B11119909 4 B12226507
5 D11144030 5 B12226546
6 D11144030 6 C11138122
7 B11119903 7 C11138184
8 C11131039 8 C11138797
9 C11133100 9 C11138152
10 C11135566 10 C11138150
11 C11137159 11 C11137167
12 C11137159 12 C11137159
13 C11137167 13 C11137167
14 C11137167 14 C11137159
15 C11137439 15 C11131039
16 C11137439 16 C11135566
17 C11137443 17 B11119903
18 C11137544 18 C11137544
19 C11137123 19 C11137443
20 C11138150 20 C11137123
21 C11138152 21 C11137439
22 C11138797 22 C11137439
23 C11138184 23 C11133100
24 C11138122 24 D11144030
25 C11138122 25 B11108399
26 C11138192 26 B11108326
27 B12226507 27 B11119909
28 B12226546 28 B11110893

Print the first column of file1.csv and file2.txt in reverse order In order to print from the last line to the first line, you can use the command tail with the flag -r (for reverse) after the command awk. The command line will first execute the awk command which is written before the | symbol, and then it will run the tail command which inverts the order of the previous output. Remember that for file1.csv you need to use -F',' to indicate that the columns are separated by commas and not spaces.

Space-separated file Comma-separated file
$ awk '{print $1}' file2.txt | tail -r $ awk -F',' '{print $1}' file1.csv | tail -r
"B11133232" "B33199522"
"B11156098" "C11137159"
"B11152577" "B11141503"
"B11150911" "B33191224"
"C11138912" "B11157974"
"B11135292" "B11131290"
"B11133232" "B33191224"
"B11156098" "B11147712"
"B11137784" "B11110927"
"B11131605" "B11110603"
"C11138999" "B11137879"
"B11155267" "B33199603"
"B33191224" "B11135072"
"B11137120" "B11135291"
"B11155267" "B11110925"
"B11154532" "B11154358"
"C11138929" "B11152799"
"B11110676" "B11110690"
"B11156453" "B11157958"
"C11137159" "B11177806"
"B11144345" "B11177579"
"B11134987" "B11153927"
"B12226566" "B11135291"
"B11137244" "B11110455"
"B11144100" "B11144410"
"B11154534" "B11137879"
"B11137244" "B33199603"
"B11130912" "B33199522"
"AnonymizedID" "Anonymized ID"

The same as in example 1, to precede each line by the line number, add NR, after the string print in the awk command to indicate that you want to print the Number Row before the column 1 ($1):

Space-separated file Comma-separated file
$ awk '{print NR, $1}' file2.txt | tail -r $ awk -F',' '{print NR, $1}' file1.csv | tail -r
29 "B11133232" 29 "B33199522"
28 "B11156098" 28 "C11137159"
27 "B11152577" 27 "B11141503"
26 "B11150911" 26 "B33191224"
25 "C11138912" 25 "B11157974"
24 "B11135292" 24 "B11131290"
23 "B11133232" 23 "B33191224"
22 "B11156098" 22 "B11147712"
21 "B11137784" 21 "B11110927"
20 "B11131605" 20 "B11110603"
19 "C11138999" 19 "B11137879"
18 "B11155267" 18 "B33199603"
17 "B33191224" 17 "B11135072"
16 "B11137120" 16 "B11135291"
15 "B11155267" 15 "B11110925"
14 "B11154532" 14 "B11154358"
13 "C11138929" 13 "B11152799"
12 "B11110676" 12 "B11110690"
11 "B11156453" 11 "B11157958"
10 "C11137159" 10 "B11177806"
9 "B11144345" 9 "B11177579"
8 "B11134987" 8 "B11153927"
7 "B12226566" 7 "B11135291"
6 "B11137244" 6 "B11110455"
5 "B11144100" 5 "B11144410"
4 "B11154534" 4 "B11137879"
3 "B11137244" 3 "B33199603"
2 "B11130912" 2 "B33199522"
1 "AnonymizedID" 1 "Anonymized ID"

Print the second and third columns of file1.csv and file2.txt. Show the lines of file2.txt in reverse order. In the previous examples we used the action statement '{print $1}' to print the first column. Since we now want to print the second and third columns instead of the first one, we replace $1 by $2, $3. If you wanted to print column 4 and 5 instead, then you would simply use $4,$5, etc. In order to print the output in reverse order for file2.txt, use the tail -r command after the awk.

Space-separated file Comma-separated file
$ awk '{print $2,$3}' file2.txt $ awk -F',' '{print $2,$3}' file1.csv | tail -r
"SubjectGroup" "TEST1" "Group1" "0"
"Group2b" "900" "Group3" "9"
"Group1" "450" "Group3" "0"
"Group1" "456" "Group2 b" "0"
"Group1" "450" "Group1" "0"
"Group1" "450" "Group2 b" "0"
"Group2b" "450" "Group2 b" "0"
"Group1" "900" "Group1" "0"
"Group1" "900" "Group1" "0"
"Group3" "MissingData" "Group1" "0"
"Group4" "456" "Group1" "0"
"Group1" "900" "Group3" "0"
"Group2b" "2" "MISSING" "0"
"Group1" "456" "Group3" "9"
"Group3" "900" "Group1" "0"
"Group2b" "450" "Group1" "0"
"Group2b" "450" "Group1" "0"
"Group3" "900" "Group3" "0"
"Group2b" "900" "Group3" "0"
"Group1" "456" "Group1" "MD"
"Group1" "900" "Group2 b" "0"
"Group1" "500" "Group1" "0"
"Group1" "500" "Group3" "0"
"Group3" "MissingData" "Group2 b" "0"
"Group2b" "900" "Group2 b" "0"
"Group2b" "900" "Group1" "0"
"Group1" "900" "Group3" "0"
"Group1" "456" "Group1" "0"
"Group1" "456" "Subject Group" "HASCONDITION"

To precede each line by the line number, add NR, after the string print in the awk command to indicate that you want to print the Number Row before the column 1 ($1):

Space-separated file Comma-separated file
$ awk '{print NR,$2,$3}' file2.txt $ awk -F',' '{print NR,$2,$3}' file1.csv | tail -r
1 "SubjectGroup" "TEST1" 29 "Group1" "0"
2 "Group2b" "900" 28 "Group3" "9"
3 "Group1" "450" 27 "Group3" "0"
4 "Group1" "456" 26 "Group2 b" "0"
5 "Group1" "450" 25 "Group1" "0"
6 "Group1" "450" 24 "Group2 b" "0"
7 "Group2b" "450" 23 "Group2 b" "0"
8 "Group1" "900" 22 "Group1" "0"
9 "Group1" "900" 21 "Group1" "0"
10 "Group3" "MissingData" 20 "Group1" "0"
11 "Group4" "456" 19 "Group1" "0"
12 "Group1" "900" 18 "Group3" "0"
13 "Group2b" "2" 17 "MISSING" "0"
14 "Group1" "456" 16 "Group3" "9"
15 "Group3" "900" 15 "Group1" "0"
16 "Group2b" "450" 14 "Group1" "0"
17 "Group2b" "450" 13 "Group1" "0"
18 "Group3" "900" 12 "Group3" "0"
19 "Group2b" "900" 11 "Group3" "0"
20 "Group1" "456" 10 "Group1" "MD"
21 "Group1" "900" 9 "Group2 b" "0"
22 "Group1" "500" 8 "Group1" "0"
23 "Group1" "500" 7 "Group3" "0"
24 "Group3" "MissingData" 6 "Group2 b" "0"
25 "Group2b" "900" 5 "Group2 b" "0"
26 "Group2b" "900" 4 "Group1" "0"
27 "Group1" "900" 3 "Group3" "0"
28 "Group1" "456" 2 "Group1" "0"
29 "Group1" "456" 1 "Subject Group" "HASCONDITION"

Print all the columns of file1.csv showing the lines in reverse order. To print all the columns of a file using awk simply use $0 (instead of a column number). Or use the command cat.

Using awk Using cat
$ awk -F',' '{print $0}' file1.csv | tail -r $ cat file1.csv | tail -r
"B33199522","Group1","0","" "B33199522","Group1","0",""
"C11137159","Group3","9","mTBI" "C11137159","Group3","9","mTBI"
"B11141503","Group3","0","" "B11141503","Group3","0",""
"B33191224","Group2 b","0","" "B33191224","Group2 b","0",""
"B11157974","Group1","0","" "B11157974","Group1","0",""
"B11131290","Group2 b","0","" "B11131290","Group2 b","0",""
"B33191224","Group2 b","0","" "B33191224","Group2 b","0",""
"B11147712","Group1","0","" "B11147712","Group1","0",""
"B11110927","Group1","0","" "B11110927","Group1","0",""
"B11110603","Group1","0","" "B11110603","Group1","0",""
"B11137879","Group1","0","" "B11137879","Group1","0",""
"B33199603","Group3","0","" "B33199603","Group3","0",""
"B11135072","MISSING","0","" "B11135072","MISSING","0",""
"B11135291","Group3","9","mTBI" "B11135291","Group3","9","mTBI"
"B11110925","Group1","0","" "B11110925","Group1","0",""
"B11154358","Group1","0","" "B11154358","Group1","0",""
"B11152799","Group1","0","" "B11152799","Group1","0",""
"B11110690","Group3","0","" "B11110690","Group3","0",""
"B11157958","Group3","0","" "B11157958","Group3","0",""
"B11177806","Group1","MD","" "B11177806","Group1","MD",""
"B11177579","Group2 b","0","" "B11177579","Group2 b","0",""
"B11153927","Group1","0","" "B11153927","Group1","0",""
"B11135291","Group3","0","" "B11135291","Group3","0",""
"B11110455","Group2 b","0","" "B11110455","Group2 b","0",""
"B11144410","Group2 b","0","" "B11144410","Group2 b","0",""
"B11137879","Group1","0","" "B11137879","Group1","0",""
"B33199603","Group3","0","" "B33199603","Group3","0",""
"B33199522","Group1","0","" "B33199522","Group1","0",""
"Anonymized ID","Subject Group","HASCONDITION","CONDITION" "Anonymized ID","Subject Group","HASCONDITION","CONDITION"

Print all the columns of file1.csv in reversed order (first the third column, then the second and finally the first one), and save the re-ordered columns in a new file called file1_reordered.csv. If you were going to print the columns one to three in normal order, you would use '{print $1,$2,$3}'. To print them in reverse order, you just reverse the order of the columns in print: '{print $3,$2,$1}'.
To save the output to a file instead of showing it in the terminal, use >> file as explained in previous sections.
Remember to use the -F',' flag to indicate that the columns are separated by commas and not the default space.

$ awk -F',' '{print $3,$2,$1}' file1.csv >> file1_reordered.csv
$ cat file1_reordered.csv

"HASCONDITION" "Subject Group" "Anonymized ID"
"0" "Group1" "B33199522"
"0" "Group3" "B33199603"
"0" "Group1" "B11137879"
"0" "Group2 b" "B11144410"
"0" "Group2 b" "B11110455"
"0" "Group3" "B11135291"
"0" "Group1" "B11153927"
"0" "Group2 b" "B11177579"
"MD" "Group1" "B11177806"
"0" "Group3" "B11157958"
"0" "Group3" "B11110690"
"0" "Group1" "B11152799"
"0" "Group1" "B11154358"
"0" "Group1" "B11110925"
"9" "Group3" "B11135291"
"0" "MISSING" "B11135072"
"0" "Group3" "B33199603"
"0" "Group1" "B11137879"
"0" "Group1" "B11110603"
"0" "Group1" "B11110927"
"0" "Group1" "B11147712"
"0" "Group2 b" "B33191224"
"0" "Group2 b" "B11131290"
"0" "Group1" "B11157974"
"0" "Group2 b" "B33191224"
"0" "Group3" "B11141503"
"9" "Group3" "C11137159"
"0" "Group1" "B33199522"

Print all the columns of file1.csv in reversed order (first the third column, then the second and finally the first one). Print the lines also in reverse order (from the last line to the first one).
Use the same command as before, adding | tail -r at the end to invert also the lines.

$ awk -F',' '{print $3,$2,$1}' file1.csv | tail -r
"0" "Group1" "B33199522"
"9" "Group3" "C11137159"
"0" "Group3" "B11141503"
"0" "Group2 b" "B33191224"
"0" "Group1" "B11157974"
"0" "Group2 b" "B11131290"
"0" "Group2 b" "B33191224"
"0" "Group1" "B11147712"
"0" "Group1" "B11110927"
"0" "Group1" "B11110603"
"0" "Group1" "B11137879"
"0" "Group3" "B33199603"
"0" "MISSING" "B11135072"
"9" "Group3" "B11135291"
"0" "Group1" "B11110925"
"0" "Group1" "B11154358"
"0" "Group1" "B11152799"
"0" "Group3" "B11110690"
"0" "Group3" "B11157958"
"MD" "Group1" "B11177806"
"0" "Group2 b" "B11177579"
"0" "Group1" "B11153927"
"0" "Group3" "B11135291"
"0" "Group2 b" "B11110455"
"0" "Group2 b" "B11144410"
"0" "Group1" "B11137879"
"0" "Group3" "B33199603"
"0" "Group1" "B33199522"
"HASCONDITION" "Subject Group" "Anonymized ID"

Read the second column of file1.csv and file2.txt and save it into an array. When saving a column of a file into an array, you must specify that the elements of the array are separated by new lines ('\n'). You do this using the command IFS=$'\n'.
The elements of the array will be saved in the variable ARRAY. As it was learned in previous chapters, to access the individual elements of ARRAY you use the syntax ${ARRAY[index]}. With index starting at 0. So, to access the first element the command is echo ${ARRAY[0]}, to access the second element echo ${ARRAY[1]}, etc. Type echo ${ARRAY[@]} to view all elements and to obtain the number of elements (size of ARRAY).
Remember, the system variable IFS contains the separator that is being used to separate each feld within the lines of a file. You can change the value of this variable at any time: IFS='character'

Space-separated file Comma-separated file
$ IFS=$'\n'
$ ARRAY=($(awk '{print $2}' file2.txt))
$ echo ${ARRAY[0]}
"SubjectGroup"
$ echo ${ARRAY[1]}
"Group2b"
$ echo ${ARRAY[@]}
"SubjectGroup" "Group2b" "Group1" "Group1" "Group1" "Group1" "Group2b" "Group1" "Group1" "Group3" "Group4" "Group1" "Group2b" "Group1" "Group3" "Group2b" "Group2b" "Group3" "Group2b" "Group1" "Group1" "Group1" "Group1" "Group3" "Group2b" "Group2b" "Group1" "Group1" "Group1"
$ echo ${#ARRAY[@]}
29
$ IFS=$'\n'
$ ARRAY=($(awk -F',' '{print $2}' file1.csv))
$ echo ${ARRAY[0]}
"Subject Group"
$ echo ${ARRAY[1]}
"Group1"
$ echo ${ARRAY[@]}
"Subject Group" "Group1" "Group3" "Group1" "Group2 b" "Group2 b" "Group3" "Group1" "Group2 b" "Group1" "Group3" "Group3" "Group1" "Group1" "Group1" "Group3" "MISSING" "Group3" "Group1" "Group1" "Group1" "Group1" "Group2 b" "Group2 b" "Group1" "Group2 b" "Group3" "Group3" "Group1"
$ echo ${#ARRAY[@]}
29

Print the first column of file2.txt followed by the first column of file4.txt. To print a specific column for more than one file, you use the same command, adding the list of files you want to print after the first one. However, all the files in the list must use the same column separator (in this case is a space). Since the column separator for this list of files is a space (the default), you don't need to use the -F flag.

$ awk '{print $1}' file2.txt file4.txt
"AnonymizedID"
"B11130912"
"B11137244"
"B11154534"
"B11144100"
"B11137244"
"B12226566"
"B11134987"
"B11144345"
"C11137159"
"B11156453"
"B11110676"
"C11138929"
"B11154532"
"B11155267"
"B11137120"
"B33191224"
"B11155267"
"C11138999"
"B11131605"
"B11137784"
"B11156098"
"B11133232"
"B11135292"
"C11138912"
"B11150911"
"B11152577"
"B11156098"
"B11133232"
AnonymizedID
B11108326
B11110893
B11119909
D11144030
D11144030
B11119903
C11131039
C11133100
C11135566
C11137159
C11137159
C11137167
C11137167
C11137439
C11137439
C11137443
C11137544
C11137123
C11138150
C11138152
C11138797
C11138184
C11138122
C11138122
C11138192
B12226507
B12226546

Print the first column of file3.csv followed by the first column of file1.csv. Since the column separator for this list of files is a comma, you need to use the -F',' flag.

$ awk -F',' '{print $1}' file3.csv file1.csv
Anonymized ID
C11138122
C11138192
B12226507
B12226546
C11138122
C11138184
C11138797
C11138152
C11138150
C11137167
C11137159
C11137167
C11137159
C11131039
C11135566
B11119903
C11137544
C11137443
C11137123
C11137439
C11137439
C11133100
D11144030
B11108399
B11108326
B11119909
B11110893
"Anonymized ID"
"B33199522"
"B33199603"
"B11137879"
"B11144410"
"B11110455"
"B11135291"
"B11153927"
"B11177579"
"B11177806"
"B11157958"
"B11110690"
"B11152799"
"B11154358"
"B11110925"
"B11135291"
"B11135072"
"B33199603"
"B11137879"
"B11110603"
"B11110927"
"B11147712"
"B33191224"
"B11131290"
"B11157974"
"B33191224"
"B11141503"
"C11137159"
"B33199522"