问题
I have to read a text file which can begin with optional comments. In practice I have to skip any line at the beginning of the file that doesn't begin with '@' or '>'. In my test case the file looks like:
# Sun Jul 12 22:04:52 2009 /share/apps/corona/bin/filter_fasta.pl --output=/data/results/solid0065/primary.20090712170542775
# Cwd: /state/partition1/home/pipeline
# Title: solid0065_20090629_FC1_Tomate_Heinz_4_5_Kb_Tomate_Heinz_4_5_Kb_01
>125_963_316_F3
T1230330231223011323010013
So I have to skip the first 3 lines (but in general I have to skip n lines). I have to repeat this with 2 or 4 files [which are inside FILE** inputFiles]. I've tried with this loop:
/* Scratch line buffer of 5000 bytes, reused for every input file. */
buffer = (char*) malloc (sizeof(char) * 5000);
if (buffer == NULL)
notEnoughMemory();
/* Visit each of the cIn-1 streams in inputFiles. */
for (i = 0; i < (cIn-1); i++){
/* Record the stream position before reading, i.e. where the next line starts. */
fgetpos(inputFiles[i], &position);
fgets(buffer, 4999, inputFiles[i]);
/* Keep reading lines until one begins with '@' or '>'. */
/* NOTE(review): the fgets() result is never checked; at end of file fgets() */
/* leaves buffer untouched, so a file without such a line would loop forever. */
while ((buffer[0] != '@') && (buffer[0] != '>')){
/* Overwrite the saved position with the start of the line about to be read. */
fgetpos(inputFiles[i], &position);
fgets(buffer, 4999, inputFiles[i]);
}
/* Seek back to the position saved just before the last fgets() call. */
fsetpos(inputFiles[i], &position);
}
Here cIn is number_of_input_files + 1. When I debug it, the loop correctly stops after it reads the fourth line. But when I call fsetpos() it doesn't go back to the beginning of the fourth line as I'd expect; it lands in the middle of the third. In fact if, right after the fsetpos(), I print buffer after these operations:
/* Read two consecutive lines from the restored position; the second call */
/* overwrites buffer, so only the second line remains in it afterwards. */
fgets(buffer, 4999, inputFiles[i]);
fgets(buffer, 4999, inputFiles[i]);
I get:
FC1_Tomate_Heinz_4_5_Kb_Tomate_Heinz_4_5_Kb_01
>125_963_316_F3
Any idea? Thanks in advance
回答1:
You could just skip processing the lines you are not interested in:
/* Go through every input stream and print only the lines that start with '@' or '>'. */
for (i = 0; i < (cIn-1); i++){
    /* fgets() yields NULL once the stream is exhausted, which ends the inner loop. */
    while (fgets(buffer, 4999, inputFiles[i]) != NULL){
        /* Any other line is dropped without further processing. */
        if (buffer[0] != '@' && buffer[0] != '>')
            continue;
        puts(buffer);
    }
}
Then you just replace the puts(buffer);
with the code you need to handle the valid lines.
(although, from your example, it sounds like you would rather ignore only the lines starting with a #?)
回答2:
Instead of fgetpos(); fsetpos();
you might use fseek(inputFiles[i], -strlen(buffer), SEEK_CUR);
回答3:
(IMHO) the best approach is to read the entire file into one big buffer (mmap is also an option, if available), then find and fix the line endings and FASTA headers. This will also reduce memory fragmentation, and it simplifies the 'parser' a lot.
EDIT: added source (it is not perfect, but last time I checked it, it worked ;-) Might be incomplete, I snipped it from a larger program.
/* One record of a parsed FASTA file, as filled in by read_complete_fasta(). */
struct fastapart {
char * name; /* NUL-terminated record name; points into the whole-file buffer */
char * data; /* record data; points into the whole-file buffer, NULL until data is seen */
unsigned size; /* number of data bytes, taken from the accumulated forward buffer */
struct roedel *friends; /* only ever set to NULL in the code shown here; purpose not visible */
};
/* A whole FASTA file held in memory together with its record index. */
struct fastafile {
size_t totsize; /* number of bytes reported by read_complete_file() */
char *tot; /* buffer with the complete file contents */
unsigned count; /* number of entries in parts */
struct fastapart *parts; /* array of parsed records */
int *alloc; /* count ints initialised to 0..count-1, or NULL if that malloc failed; use not visible here */
};
struct fastafile * read_complete_fasta(char *name)
{
int rc,state;
struct fastafile * result;
size_t pos,len,cnt,idx;
struct strbuff *fwd=NULL,*rev = NULL;
result = malloc (sizeof *result);
if (!result) return NULL;
result->tot = read_complete_file(name , &result->totsize);
if (!result->tot) goto failfree;
result->count = 0;
result->parts = NULL;
for (pos=cnt=state=0; pos < result->totsize; ) {
switch (state) {
case 0: /* find first '>' */
if (result->tot[pos] == '>') { pos++; state=2; continue; }
pos += strcspn( result->tot+pos, "\n" );
case 1: /* not found: sync to newline */
if (result->tot[pos] == '\n') { pos++; state=0; continue; }
else pos++;
continue;;
case 2: /* Got '>'; grab name */
len = strcspn( result->tot+pos, " \t\n" );
if (cnt >= result->count) {
size_t siz;
siz = result->count ? 2* result->count: 16;
result->parts = realloc( result->parts
, siz * sizeof *result->parts);
for ( ; result->count < siz;result->count ++) {
result->parts[cnt].name = NULL;
result->parts[cnt].data = NULL;
result->parts[cnt].friends = NULL;
result->parts[cnt].size = 0;
}
}
result->parts[cnt].name = result->tot+pos;
result->parts[cnt].name[len] = 0;
pos += 1+len;
len = strspn( result->tot+pos, " \t\n" );
pos += len;
state++;
continue;
case 3: /* grab data; for the moment, throw away reversed data */
if (result->tot[pos] == '>') {
if (fwd) {
memcpy(result->parts[cnt].data, fwd->data, fwd->used ); result->parts[cnt].size = fwd->used;
result->parts[cnt].data [ fwd->used ] = 0;
fwd->used = 0; }
if (rev) {
/* memcpy(result->parts[cnt].data+result->parts[cnt].size, rev->data, rev->used ); */
rev->used = 0;
}
if (result->parts[cnt].data) cnt++;
pos++; state=2;
continue;
}
len = strcspn( result->tot+pos, "\t\n" );
if (!len) { /* empty line; what to do? skip it! */
fprintf(stderr, "Empty\n" );
pos++; state=1;
continue; }
if (!result->parts[cnt].data) {result->parts[cnt].data = result->tot+pos; }
fwd = strbuff_add(fwd, result->tot+pos, len);
pos += len;
if (result->tot[pos] == '\t' ) {
pos += strspn(result->tot+pos, " \t" );
len = strcspn( result->tot+pos, "\n" );
rev = strbuff_add(rev, result->tot+pos, len);
pos += len;
}
pos += strspn(result->tot+pos, " \t\r\n" );
}}
if (state == 3) {
if (fwd) {
memcpy(result->parts[cnt].data, fwd->data, fwd->used ); result->parts[cnt].size = fwd->used;
result->parts[cnt].data [ fwd->used ] = 0;
fwd->used = 0;
}
if (rev) {
/* memcpy(result->parts[cnt].data+result->parts[cnt].size, rev->data, rev->used ); */
rev->used = 0;
}
if (result->parts[cnt].data) cnt++;
}
/* final realloc */
result->parts = realloc( result->parts, cnt * sizeof *result->parts);
result->count = cnt;
free (fwd);
free (rev);
result->alloc = malloc( result->count * sizeof result->alloc[0] );
if (result->alloc) {
for (cnt = 0; cnt < result->count; cnt++ ) result->alloc[cnt] = cnt;
}
return result;
failfree:
free (fwd);
free (rev);
free (result);
return NULL;
}
/*
 * Read the whole file `name` into a freshly malloc'ed buffer.
 *
 * On success, returns the buffer and stores the number of bytes read in
 * *sizep. The buffer is one byte larger than the file size and has a NUL
 * byte directly after the data, so it can be scanned with string functions.
 * The caller must free() it. A progress line is written to stderr.
 *
 * On failure (open, fstat, allocation or read error), returns NULL and
 * stores 0 in *sizep.
 */
char * read_complete_file(char *name, size_t *sizep)
{
    int fd;
    size_t size, want;
    char *result;
    struct stat st;

    fd = open(name, O_RDONLY);
    if (fd == -1) goto fail;
    if (fstat(fd, &st) == -1) goto closefail;
    if (st.st_size < 0) goto closefail;
    want = (size_t) st.st_size;

    result = malloc(want + 1);
    if (!result) goto closefail;

    for (size = 0; size < want; ) {
        /* Append after what is already in the buffer; read() may return fewer bytes than asked. */
        ssize_t got = read(fd, result + size, want - size);
        if (got < 0) goto freeclosefail;
        /* End of file before st_size bytes arrived: stop instead of spinning. */
        if (got == 0) break;
        size += (size_t) got;
    }
    /* Terminate right after the bytes actually read. */
    result[size] = 0;

    fprintf(stderr, "Read %lu bytes FROM %s\n"
        , (unsigned long) size, name);
    close(fd);
    *sizep = size;
    return result;

freeclosefail:
    free(result);
closefail:
    close(fd);
fail:
    *sizep = 0;
    return NULL;
}
来源:https://stackoverflow.com/questions/7672157/c-trying-to-go-back-to-previous-line-in-the-file