Efficient cython file reading, string parsing, and array building

醉话见心 · 2020-12-28 10:54

So I have some data files that look like this:

      47
   425   425  -3 15000 15000 900   385   315   3   370   330   2   340   330   2
   325   315   2   3         


        
2 Answers
  • 2020-12-28 11:08

    Here is a faster example. It uses fast_atoi() to convert strings to ints and is 2x faster than get_points_cython() on my PC. If the point-count lines all have the same width (8 chars), I think it can be sped up further (to about 12x faster than get_points_cython()).

    %%cython
    import numpy as np
    cimport numpy as np
    import cython
    
    cdef int fast_atoi(char *buff):
        # Parse a decimal integer from a NUL-terminated char buffer.
        cdef int c = 0, sign = 0, x = 0
        cdef char *p = buff
        while True:
            c = p[0]
            if c == 0:               # NUL terminator: end of string
                break
            if c == 45:              # '-'
                sign = 1
            elif c > 47 and c < 58:  # '0'..'9'
                x = x * 10 + c - 48
            p += 1
        return -x if sign else x
    
    @cython.boundscheck(False)
    @cython.wraparound(False)
    def get_points_cython_numpy(filename):
        cdef int i, j, x, y, z, n_chunks
        cdef bytes line, chunk
        cdef int[:, ::1] points = np.zeros([500000, 3], np.int32)  # preallocated; trimmed to j rows on return
        f = open(filename, 'rb')
        j = 0
        for line in f:
            # header lines are shorter than 16 chars, so they yield n_chunks == 0 and are skipped
            n_chunks = len(line) // 16
            for i in range(n_chunks):
                chunk = line[16*i:16*(i+1)]
                x = fast_atoi(chunk[0:6])
                y = fast_atoi(chunk[6:12])
                z = fast_atoi(chunk[12:16])
                points[j, 0] = x
                points[j, 1] = y
                points[j, 2] = z
                j = j + 1
    
        f.close()
        return points.base[:j]
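
    To try this out (a minimal sketch; `points.txt` is a hypothetical file in the layout shown in the question):

    points = get_points_cython_numpy('points.txt')  # run in the same IPython session as the %%cython cell
    points.shape  # -> (n_points, 3), dtype int32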
    

    Here is the fastest method. The idea is to read the whole file into a bytes object and extract the point data directly from it.

    %%cython
    import numpy as np
    cimport numpy as np
    import cython

    @cython.boundscheck(False)
    @cython.wraparound(False)
    cdef inline int fast_atoi(char *buf, int size):
        # Parse a decimal integer from at most `size` chars (the field need not be NUL-terminated).
        cdef int i = 0, c = 0, sign = 0, x = 0
        for i in range(size):
            c = buf[i]
            if c == 0:               # early NUL: stop
                break
            if c == 45:              # '-'
                sign = 1
            elif c > 47 and c < 58:  # '0'..'9'
                x = x * 10 + c - 48
        return -x if sign else x
    
    @cython.boundscheck(False)
    @cython.wraparound(False)
    def fastest_read_points(fn):
        cdef bytes buf
        with open(fn, "rb") as f:
            buf = f.read().replace(b"\n", b"")  # strip newlines; change this to match your file's line ending
    
        cdef char * p = buf
        cdef int length = len(buf)
        cdef char * buf_end = p + length
        cdef int count = length // 16 * 2  # overallocate a result array that is certainly large enough
        cdef int[:, ::1] res = np.zeros((count, 3), np.int32)
        cdef int i, j, block_count
        i = 0
        while p < buf_end:
            block_count = fast_atoi(p, 10)  # block header: the point count (10 chars wide in this variant)
            p += 10                         # skip past the header field
            for j in range(block_count):
                res[i, 0] = fast_atoi(p, 6)
                res[i, 1] = fast_atoi(p+6, 6)
                res[i, 2] = fast_atoi(p+12, 4)
                p += 16
                i += 1
        return res.base[:i]
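
    For a rough side-by-side timing of the two functions (a sketch; `points.txt` is again a hypothetical test file, and both %%cython cells must have been run first):

    %timeit get_points_cython_numpy('points.txt')
    %timeit fastest_read_points('points.txt')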
    
  • 2020-12-28 11:09

    Files that are fixed format and well behaved can be read efficiently with Numpy. The idea is to read the file into an array of strings and then convert to integers in one go. The tricky bit is the handling of variable-width fields and the placement of newline characters. One way to do it for your file is:

    def read_chunk_numpy(fh, n_points):
        # 16 chars per point, plus one newline character per line of 5 points (rounded up)
        n_bytes = n_points * 16 + (n_points + 4) // 5
    
        txt_arr = np.fromfile(fh, 'S1', n_bytes)
        txt_arr = txt_arr[txt_arr != b'\n']    
        xyz = txt_arr.view('S6,S6,S4').astype('i,i,i')
        xyz.dtype.names = 'x', 'y', 'z'
        return xyz
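
    To see what the view/astype step does in isolation, here is a standalone toy example using the two points from the first data line in the question (it assumes the same 6+6+4 fixed-width layout):

    import numpy as np

    raw = b'   425   425  -3 15000 15000 900'  # two 16-char records, newline already removed
    txt = np.frombuffer(raw, 'S1')              # one byte per element, shape (32,)
    xyz = txt.view('S6,S6,S4').astype('i,i,i')  # regroup bytes into fields, then parse to ints
    xyz.dtype.names = 'x', 'y', 'z'
    # xyz['x'] -> [425, 15000], xyz['y'] -> [425, 15000], xyz['z'] -> [-3, 900]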
    

    Note that \n newline characters are assumed, so some more effort is needed for portability (a \r\n variant is sketched after the test code). This gave me a huge speedup compared to the plain Python loop. Test code:

    import numpy as np
    
    def write_testfile(fname, n_points):
        with open(fname, 'wb') as fh:
            for _ in range(n_points // 1000):
                n_chunk = np.random.randint(900, 1100)
                fh.write((str(n_chunk).rjust(8) + '\n').encode())  # encode: file is opened in binary mode
                xyz = np.random.randint(10**4, size=(n_chunk, 3))
                for i in range(0, n_chunk, 5):
                    for row in xyz[i:i+5]:
                        fh.write(('%6i%6i%4i' % tuple(row)).encode())
                    fh.write(b'\n')
    
    def read_chunk_plain(fh, n_points):
        points = []
        count = 0
        # Use while-loop because `for line in fh` would mess with file pointer
        while True:
            line = fh.readline()
            n_chunks = len(line) // 16
            for i in range(n_chunks):
                chunk = line[16*i:16*(i+1)]
                x = int(chunk[0:6])
                y = int(chunk[6:12])
                z = int(chunk[12:16])
                points.append((x, y, z))
    
                count += 1
                if count == n_points:
                    return points
    
    def test(fname, read_chunk):
        with open(fname, 'rb') as fh:
            line = fh.readline().strip()
            while line:
                n = int(line)
                read_chunk(fh, n)
                line = fh.readline().strip()
    
    fname = 'test.txt'
    write_testfile(fname, 10**5)
    %timeit test(fname, read_chunk_numpy)
    %timeit test(fname, read_chunk_plain)
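
    And the portability sketch promised above: one way to adapt read_chunk_numpy to Windows-style line endings (the helper name is made up, and it assumes every newline in the file is the two-byte \r\n):

    def read_chunk_numpy_crlf(fh, n_points):
        # 16 chars per point, plus two bytes (\r\n) per line of 5 points (rounded up)
        n_bytes = n_points * 16 + 2 * ((n_points + 4) // 5)
        txt_arr = np.fromfile(fh, 'S1', n_bytes)
        txt_arr = txt_arr[(txt_arr != b'\n') & (txt_arr != b'\r')]
        xyz = txt_arr.view('S6,S6,S4').astype('i,i,i')
        xyz.dtype.names = 'x', 'y', 'z'
        return xyz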
    