问题
This morning I asked here why my Python code was (a lot) slower than my F# version, but now I'm wondering whether the F# version itself can be made faster. Any ideas on how I could create a faster version of the code below, which reads a sorted list of unique indexes from a binary file of 32-bit integers? Note that I tried two approaches: one based on a BinaryReader, the other based on a MemoryMappedFile (and some more on GitHub).
module SimpleRead
/// Reads the 32-bit integer stored in cell `cellIndex` of the stream behind `reader`.
/// Returns `None` when the stored value is `Int32.MinValue`, otherwise `Some value`.
let readValue (reader:BinaryReader) cellIndex =
    // Every cell is 4 bytes wide, so cell n starts at byte n*4.
    let byteOffset = cellIndex * 4L
    reader.BaseStream.Position <- byteOffset
    let raw = reader.ReadInt32()
    if raw = Int32.MinValue then None else Some raw
/// Opens `fileName` for shared read-only access and reads the cell at every
/// index in `indices`, returning the results as a list in the same order.
let readValues fileName indices =
    use reader = new BinaryReader(File.Open(fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
    // Build a list (eager) rather than a seq (lazy): all reads must happen
    // before `reader` is disposed when this function returns.
    indices
    |> List.ofSeq
    |> List.map (readValue reader)
module MemoryMappedSimpleRead =

    open System.IO.MemoryMappedFiles

    /// Reads the 32-bit integer for cell `cellIndex` from a view accessor.
    /// `offset` is the byte offset in the file at which the view starts, so the
    /// position inside the view is the cell's absolute byte offset minus `offset`.
    /// Returns `None` when the stored value is `Int32.MinValue`, otherwise `Some value`.
    let readValue (reader:MemoryMappedViewAccessor) offset cellIndex =
        let position = (cellIndex*4L) - offset
        match reader.ReadInt32(position) with
        | Int32.MinValue -> None
        | v -> Some(v)

    /// Reads the cells at `indices` from `fileName` through a read-only
    /// memory-mapped view and returns them as a list in the same order.
    /// An empty `indices` yields an empty list without touching the file.
    let readValues (fileName:string) indices =
        // Materialise the indices once: they are needed for min, max and the
        // reads themselves, and a lazy seq would otherwise be enumerated three times.
        let cells = List.ofSeq indices
        match cells with
        | [] -> []
        | _ ->
            // Map the file read-only (mapName = null, capacity 0 = size of the file
            // on disk) so that no write access to the file is required.
            use mmf = MemoryMappedFile.CreateFromFile(fileName, FileMode.Open, null, 0L, MemoryMappedFileAccess.Read)
            // The view spans only the bytes from the smallest to the largest
            // requested cell (inclusive of that last 4-byte cell).
            let offset = (List.min cells) * 4L
            let last = (List.max cells) * 4L
            let length = 4L + last - offset
            use reader = mmf.CreateViewAccessor(offset, length, MemoryMappedFileAccess.Read)
            cells |> List.map (readValue reader offset)
For comparison here is my latest numpy version
import numpy as np
def convert(v):
    """Map the sentinel value -2147483648 (the int32 minimum) to None.

    Any other value is returned unchanged.
    """
    # `!=` replaces the Python-2-only `<>` operator, which is a syntax
    # error in Python 3; `!=` is valid in both.
    if v != -2147483648:
        return v
    else:
        return None
def read_values(filename, indices):
    """Read the int32 cells at `indices` from `filename` via a read-only memmap.

    Returns a list with one entry per index: the stored value, or None
    where `convert` maps the value to None.
    """
    values_arr = np.memmap(filename, dtype='int32', mode='r')
    # Build the list eagerly: on Python 3 a bare `map(...)` is a lazy
    # iterator, whereas on Python 2 it already returned a list.
    return [convert(v) for v in values_arr[indices]]
Update: Contrary to what I said here before, my Python version is still a lot slower than the F# version; it only appeared otherwise because of an error in my Python tests. I'm leaving this question here in case someone with in-depth knowledge of BinaryReader or MemoryMappedFile knows of some improvements.
回答1:
I managed to make the SimpleReader 30% faster by using reader.BaseStream.Seek instead of setting reader.BaseStream.Position. I also replaced lists with arrays, but this didn't change much.
The full code of my simple reader is now:
open System
open System.IO
/// Reads the 32-bit integer stored in cell `cellIndex` of the stream behind `reader`.
/// Returns `None` when the stored value is `Int32.MinValue`, otherwise `Some value`.
let readValue (reader:BinaryReader) (cellIndex:int) =
    // Widen to int64 *before* multiplying by the 4-byte cell size. Multiplying
    // as a 32-bit int first wraps around silently once the byte offset no longer
    // fits in an int32, which would seek to the wrong place in large files.
    let byteOffset = int64 cellIndex * 4L
    reader.BaseStream.Seek(byteOffset, SeekOrigin.Begin) |> ignore
    match reader.ReadInt32() with
    | Int32.MinValue -> None
    | v -> Some(v)
/// Opens `fileName` for shared read-only access and reads the cell at every
/// index in the `indices` array, returning the results as an array in the same order.
let readValues indices fileName =
    use reader = new BinaryReader(File.Open(fileName, FileMode.Open, FileAccess.Read, FileShare.Read))
    // Array.map is eager, so every read completes before `reader` is
    // disposed when this function returns.
    indices |> Array.map (fun cell -> readValue reader cell)
The full code and versions in other languages are on GitHub.
来源:https://stackoverflow.com/questions/24386844/performance-issue-with-reading-integers-from-a-binary-file-at-specific-locations