自作のtail(その7)

前回まではByte単位でしたが、今回から行単位の抽出処理を作っていきます。

tailコマンドでファイル末尾を表示する際、ファイルサイズが小さければ、全部読み込んだ後、所望のファイル末尾だけ表示する処理でも良いですが、大きなファイルだと、おそらく時間もリソースも無駄に使ってしまいます。まだ作ってないので、おそらく、ですが。
なので、あまり大きくないブロック単位で読み込んで、効率的に処理するようなプログラムにしていこうと思います。

大きなファイルを扱う前に、まずは、小さなファイルを対象に処理を作っていきます。

takk@deb9:~/tmp$ seq 12 | tee a.txt
1
2
3
4
5
6
7
8
9
10
11
12
takk@deb9:~/tmp$ wc -c a.txt
27 a.txt
takk@deb9:~/tmp$

このような小さなファイルを、ファイル末尾から10 Byte毎に読み込む処理にします。
ダンプするとこのように改行が入っています。

takk@deb9:~/tmp$ od -tx1c -Ad -w10 a.txt
0000000  31  0a  32  0a  33  0a  34  0a  35  0a
          1  \n   2  \n   3  \n   4  \n   5  \n
0000010  36  0a  37  0a  38  0a  39  0a  31  30
          6  \n   7  \n   8  \n   9  \n   1   0
0000020  0a  31  31  0a  31  32  0a
         \n   1   1  \n   1   2  \n
0000027
takk@deb9:~/tmp$

10 Byte単位で読むので、改行も1 Byteとして数えると、1回目の読み込みは、

takk@deb9:~/tmp$ od -tx1c -Ad -j17 a.txt
0000017  0a  31  30  0a  31  31  0a  31  32  0a
         \n   1   0  \n   1   1  \n   1   2  \n
0000027
takk@deb9:~/tmp$

となり、2回目は、

takk@deb9:~/tmp$ od -tx1c -Ad -j7 -N10 a.txt
0000007  0a  35  0a  36  0a  37  0a  38  0a  39
         \n   5  \n   6  \n   7  \n   8  \n   9
0000017
takk@deb9:~/tmp$

3回目は、

takk@deb9:~/tmp$ od -tx1c -Ad -N7 a.txt
0000000  31  0a  32  0a  33  0a  34
          1  \n   2  \n   3  \n   4
0000007
takk@deb9:~/tmp$

となり、ファイルサイズが27 Byteで、10 Byte毎に読み込むので、3回目だけ7 Byteとなります。

以下は、上で述べたような読み込みができるように、べた書きしたプログラムです。
rbufに読み込んだ後、その中の改行数を数えます。

takk@deb9:~/tmp$ cat -n t.c
     1	#include <stdio.h>
     2	#include <sys/types.h>	//stat
     3	#include <sys/stat.h>	//stat
     4	#include <unistd.h>	//stat
     5	#include <getopt.h>	//getopt
     6	#include <stdlib.h>	//atoi
     7	#include <fcntl.h>	//open
     8	
     9	int main(int argc, char* argv[])
    10	{
    11		int fd;
    12		char *fname;
    13		struct stat fs;
    14	
    15		char rbuf[10];
    16		int lf_num=0;
    17		int i,len;
    18	
    19		fname = argv[1];
    20	
    21		stat(fname, &fs);
    22	
    23		fd=open(fname, O_RDONLY);
    24	
    25			lseek(fd, -10, SEEK_END);
    26			len = read(fd, rbuf, 10);
    27			for(i = 0; i < len; i++){
    28				if(rbuf[i] == 0x0a){
    29					lf_num++;
    30				}
    31			}
    32	
    33			lseek(fd, -20, SEEK_END);
    34			len = read(fd, rbuf, 10);
    35			for(i = 0; i < len; i++){
    36				if(rbuf[i] == 0x0a){
    37					lf_num++;
    38				}
    39			}
    40	
    41			lseek(fd, -27, SEEK_END);
    42			len = read(fd, rbuf, 7);
    43			for(i = 0; i < len; i++){
    44				if(rbuf[i] == 0x0a){
    45					lf_num++;
    46				}
    47			}
    48	
    49			printf("lf_num=%d\n",lf_num);
    50	
    51		close(fd);
    52	
    53		return 0;
    54	}
takk@deb9:~/tmp$

実行。改行の数が正しく表示されました。

takk@deb9:~/tmp$ gcc t.c
takk@deb9:~/tmp$ ./a.out a.txt
lf_num=12
takk@deb9:~/tmp$

上のべた書き部分をfor文でまとめてみます。

takk@deb9:~/tmp$ cat -n t.c

～省略～

    16          int lf_num=0;
    17          int bsize=sizeof(rbuf);
    18          int i,len,size,bno;
    19
    20          fname = argv[1];
    21
    22          stat(fname, &fs);
    23
    24          fd=open(fname, O_RDONLY);
    25
    26          for(bno=0; bno <= fs.st_size/bsize; bno++){
    27                  size = bsize*(bno+1);
    28                  size = fs.st_size < size ? fs.st_size : size;
    29                  lseek(fd, -size, SEEK_END);
    30
    31                  size = size % bsize;
    32                  size = size > 0 ? size : bsize;
    33                  len = read(fd, rbuf, size);
    34
    35                  for(i = 0; i < len; i++){
    36                          if(rbuf[len-i-1] == 0x0a){
    37                                  lf_num++;
    38                          }
    39                  }
    40          }
    41
    42          printf("lf_num=%d\n",lf_num);
    43
    44          close(fd);
    45
    46          return 0;
    47  }
takk@deb9:~/tmp$

結果は同じく、

lf_num=12

が表示されます。

ここまでできたら、抽出行数と、発見した改行数が一致するところを見つければ、ファイル末尾の行抽出ができます。

takk@deb9:~/tmp$ cat -n t.c
     1	#include <stdio.h>
     2	#include <sys/types.h>	//stat
     3	#include <sys/stat.h>	//stat
     4	#include <unistd.h>	//stat
     5	#include <getopt.h>	//getopt
     6	#include <stdlib.h>	//atoi
     7	#include <fcntl.h>	//open
     8	
     9	int main(int argc, char* argv[])
    10	{
    11		int fd;
    12		char *fname;
    13		struct stat fs;
    14	
    15		char rbuf[10];
    16		char wbuf[100];	//for output
    17		int lf_num=0;
    18		int bsize=sizeof(rbuf);
    19		int req_lines=10;
    20		int lf_index;
    21		int i,len,size,bno;
    22	
    23		fname = argv[1];
    24		req_lines = atoi(argv[2]);
    25	
    26		stat(fname, &fs);
    27	
    28		fd=open(fname, O_RDONLY);
    29	
    30		for(bno=0; bno <= fs.st_size/bsize; bno++){
    31			size = bsize*(bno+1);
    32			size = fs.st_size < size ? fs.st_size : size;
    33			lseek(fd, -size, SEEK_END);
    34	
    35			size = size % bsize;
    36			size = size > 0 ? size : bsize;
    37			len = read(fd, rbuf, size);
    38	
    39			for(i = 0; i < len; i++){
    40				if(rbuf[len-i-1] == 0x0a){
    41					if(!((bno == 0) && (i==0))){
    42						lf_num++;
    43					}
    44					if(lf_num == req_lines){
    45						goto finish;
    46					}
    47				}
    48			}
    49		}
    50		bno--;
    51	finish:
    52		lf_index=i;
    53		size = bsize*bno+lf_index;
    54		lseek(fd, -size, SEEK_END);
    55		len = read(fd, wbuf, size);
    56	
    57		for(i = 0; i < len; i++){
    58			putchar((int)wbuf[i]);
    59		}
    60		close(fd);
    61	
    62		return 0;
    63	}
takk@deb9:~/tmp$

41行目で判定している、ファイル末尾の改行を、lf_numにカウントしないのがポイントです。

finishラベル以降では、100 Byteのバッファwbufを使って、再度ファイルを読み込んで、表示しています。ただし、wbufは100 Byte固定なので、抽出する部分が100 Byteを超える場合にはまだ対応できていません。

この作りだと、改行の数を数えるために読み込みを行い、抽出を開始する位置が決まったら再度読み込みを行うので、二度手間のような気もします。まあ、自作tailができたら、答え合わせで本物のtailのソースを確認しようと思っています。

さて、実行結果。ファイル末尾3行抽出してみます。

takk@deb9:~/tmp$ gcc t.c
takk@deb9:~/tmp$ ./a.out a.txt 3
10
11
12
takk@deb9:~/tmp$

パラメータ値を少しずつ増やして確認。

takk@deb9:~/tmp$ for i in {1..13};do ./a.out a.txt $i | xargs echo;done
12
11 12
10 11 12
9 10 11 12
8 9 10 11 12
7 8 9 10 11 12
6 7 8 9 10 11 12
5 6 7 8 9 10 11 12
4 5 6 7 8 9 10 11 12
3 4 5 6 7 8 9 10 11 12
2 3 4 5 6 7 8 9 10 11 12
1 2 3 4 5 6 7 8 9 10 11 12
1 2 3 4 5 6 7 8 9 10 11 12
takk@deb9:~/tmp$

動きは大丈夫そうです。

つづく