C - Trying to go back to previous line in the file

问题

I have to read a text file which can begin with optional comments. In practice I have to skip any line at the beginning of the file that doesn't begin with '@' or '>'. In my test case the file looks like:

# Sun Jul 12 22:04:52 2009 /share/apps/corona/bin/filter_fasta.pl --output=/data/results/solid0065/primary.20090712170542775 
# Cwd: /state/partition1/home/pipeline
# Title: solid0065_20090629_FC1_Tomate_Heinz_4_5_Kb_Tomate_Heinz_4_5_Kb_01
>125_963_316_F3
T1230330231223011323010013

So I have to skip the first 3 line (but in general I have to skip n lines). I have to repeat this with 2 or 4 files [which are inside FILE** inputFiles]. I've tried with this loop:

buffer = (char*) malloc (sizeof(char) * 5000);
if (buffer == NULL)
    notEnoughMemory();

for (i = 0; i < (cIn-1); i++){
    fgetpos(inputFiles[i], &position);
    fgets(buffer, 4999, inputFiles[i]);
    while ((buffer[0] != '@') && (buffer[0] != '>')){
        fgetpos(inputFiles[i], &position);
        fgets(buffer, 4999, inputFiles[i]);
    }
    fsetpos(inputFiles[i], &position);
}

Where cIn is number_of_input_files + 1. Trying to debug it the loop correctly stops after it reads the fourth line. But when I use setpos it doesn't go back to the beginning of the fourth line as I'd expect, but at the middle of the third. In fact if, exactly after the fsetpos(), I print buffer after these operations:

fgets(buffer, 4999, inputFiles[i]);
fgets(buffer, 4999, inputFiles[i]);

I get:

FC1_Tomate_Heinz_4_5_Kb_Tomate_Heinz_4_5_Kb_01
>125_963_316_F3

Any idea? Thanks in advance

回答1:

You could just skip processing the lines you are not interrested in:

for (i = 0; i < (cIn-1); i++){

    while (fgets(buffer, 4999, inputFiles[i])){
       if(buffer[0] == '@' || buffer[0] == '>') {
          puts(buffer);
        }
        /* else do nothing*/
    }
}

Then you just replace the puts(buffer); with the code you need to handle the valid lines. (allthough, from your example it sounds like you rather want to only ignore lines starting with a #, ?)

回答2:

Instead of fgetpos(); fsetpos(); you might use
fseek(inputFiles[i], -strlen(buffer), SEEK_CUR);

回答3:

(IMHO )Best is to read the entire file into one big buffer (mmap is also an option, if available) , then find and fix the line endings and fasta headers. This will also reduce memory fragmentation. And it simpifies the 'parser' a lot.

EDIT: added source (it is not perfect, but last time I checked it, it worked ;-) Might be incomplete, I snipped it from a larger program.

struct fastapart {
  char * name;
  char * data;
  unsigned size;
  struct roedel *friends;
  };
struct fastafile {
  size_t totsize;
  char *tot;
  unsigned count;
  struct fastapart *parts;
  int *alloc;
  };

struct fastafile * read_complete_fasta(char *name)
{
int rc,state;
struct fastafile * result;
size_t pos,len,cnt,idx;
struct strbuff *fwd=NULL,*rev = NULL;

result = malloc (sizeof *result);
if (!result) return NULL;
result->tot = read_complete_file(name , &result->totsize);
if (!result->tot) goto failfree;

result->count = 0;
result->parts = NULL;

for (pos=cnt=state=0; pos < result->totsize; ) {
switch (state) {
case 0: /* find first '>' */
  if (result->tot[pos] == '>') { pos++; state=2; continue; }
  pos += strcspn( result->tot+pos, "\n" );
case 1: /* not found: sync to newline */
  if (result->tot[pos] == '\n') { pos++; state=0; continue; }
  else pos++;
  continue;;
case 2: /* Got '>'; grab name */
  len = strcspn( result->tot+pos, " \t\n" );
  if (cnt >= result->count) {
    size_t siz;
    siz = result->count ? 2* result->count: 16;
    result->parts = realloc( result->parts
      , siz * sizeof *result->parts);
    for (  ; result->count < siz;result->count ++) {
      result->parts[cnt].name = NULL;
      result->parts[cnt].data = NULL;
      result->parts[cnt].friends = NULL;
      result->parts[cnt].size = 0;
      }
    }

  result->parts[cnt].name = result->tot+pos;
  result->parts[cnt].name[len] = 0;
  pos += 1+len;
  len = strspn( result->tot+pos, " \t\n" );
  pos += len;
  state++;
  continue;
case 3: /* grab data; for the moment, throw away reversed data */
  if (result->tot[pos] == '>') {
    if (fwd) {
      memcpy(result->parts[cnt].data, fwd->data, fwd->used ); result->parts[cnt].size = fwd->used;
      result->parts[cnt].data [ fwd->used ] = 0;
      fwd->used = 0; }
    if (rev) {
      /* memcpy(result->parts[cnt].data+result->parts[cnt].size, rev->data, rev->used );  */
      rev->used = 0;
      }
    if (result->parts[cnt].data) cnt++;
    pos++; state=2;
    continue;
    }
  len = strcspn( result->tot+pos, "\t\n" );
  if (!len) { /* empty line; what to do? skip it! */
    fprintf(stderr, "Empty\n" );
    pos++; state=1;
    continue; }
  if (!result->parts[cnt].data) {result->parts[cnt].data = result->tot+pos;  }
  fwd = strbuff_add(fwd, result->tot+pos, len);
  pos += len;
  if (result->tot[pos] == '\t' ) {
    pos += strspn(result->tot+pos, " \t" );
    len = strcspn( result->tot+pos, "\n" );
    rev =  strbuff_add(rev, result->tot+pos, len);
    pos += len;
    }
  pos += strspn(result->tot+pos, " \t\r\n" );
  }}
if (state == 3) {
  if (fwd) {
    memcpy(result->parts[cnt].data, fwd->data, fwd->used ); result->parts[cnt].size = fwd->used;
    result->parts[cnt].data [ fwd->used ] = 0;
    fwd->used = 0;
    }
  if (rev) {
    /* memcpy(result->parts[cnt].data+result->parts[cnt].size, rev->data, rev->used );  */
    rev->used = 0;
    }
  if (result->parts[cnt].data) cnt++;
  }
  /* final realloc */
result->parts = realloc( result->parts, cnt * sizeof *result->parts);
result->count  = cnt;
free (fwd);
free (rev);

result->alloc = malloc( result->count * sizeof result->alloc[0] );
if (result->alloc) {
  for (cnt = 0; cnt <  result->count; cnt++ ) result->alloc[cnt] = cnt;
  }
return result;

failfree:
free (fwd);
free (rev);
free (result);
return NULL;
}

char * read_complete_file(char *name, size_t *sizep)
{
int fd, rc;
size_t size, len;
char *result;

struct stat st;

fd = open(name, O_RDONLY);

if (fd == -1) goto fail;
rc = fstat(fd, &st);
if (rc == -1) goto closefail;
result = malloc (1+st.st_size );
if (!result ) goto closefail;
result[st.st_size] = 0;

for (size = 0; size < st.st_size;) {
  rc = read(fd, result, st.st_size - size);
  if (rc < 0) goto freeclosefail;
  size += rc;
  }

fprintf(stderr, "Read %lu bytes FROM %s\n"
  , (unsigned long) size, name);
close(fd);
*sizep = size;
return result;

freeclosefail:
  free(result);
closefail:
  close(fd);
fail:
  *sizep=0; return NULL;
}

来源：https://stackoverflow.com/questions/7672157/c-trying-to-go-back-to-previous-line-in-the-file

标签

file

line

back