Optimize String Parsing

余生长醉 提交于 2019-12-05 02:02:35
tofi9

Have you considered using pull-style reads and recursive processing? That would eliminate reading the whole file into memory, and it would also eliminate the need to manage your own stack to keep track of how deep you are in the parse.

Below is an example in Swift. The example works with your sample "txf", but not with the Dropbox version, because some of your "members" span multiple lines. If this is a requirement, it can easily be implemented in the "$" case of the switch statement. However, I don't see your own code handling this either. Also, the example doesn't follow correct Swift error handling yet (the parse method would need an additional NSError parameter).

import Foundation

/// Int-based convenience wrappers around String's index-based API.
extension String
{
    /// Returns the zero-based offset of the first occurrence of `char`,
    /// or nil when the character does not occur in the string.
    public func indexOfCharacter(char: Character) -> Int? {
        let match = find(self, char)
        if match == nil {
            return nil
        }
        return distance(self.startIndex, match!)
    }

    /// Returns the prefix that ends just before offset `index`.
    func substringToIndex(index:Int) -> String {
        let cut = advance(self.startIndex, index)
        return self.substringToIndex(cut)
    }

    /// Returns the suffix that starts at offset `index`.
    func substringFromIndex(index:Int) -> String {
        let cut = advance(self.startIndex, index)
        return self.substringFromIndex(cut)
    }
}


/// Recursively parses one object from the stream.
///
/// Reads lines until the closing "/<parentTagName>" line is found. Each line
/// is dispatched on its first character:
///   "$key=value"  stores `value` under `key`
///   "#tag"        parses a nested object and appends it to the list stored under `tag`
///   "/tag"        ends the current object when `tag` equals `parentTagName`
///
/// Malformed lines (empty, unknown marker, missing "=", mismatched end tag)
/// are reported with println and skipped.
///
/// :param: aStreamReader  line source; consumed up to and including the closing tag
/// :param: parentTagName  name of the tag whose body is being parsed
/// :returns: the properties and nested object lists collected for this object
func parse(aStreamReader:StreamReader, parentTagName:String) -> Dictionary<String,AnyObject> {
    var dict = Dictionary<String,AnyObject>()

    while let line = aStreamReader.nextLine() {

        // An empty line has no marker character; report it like any other
        // unrecognised line instead of force-unwrapping nil below.
        if line.isEmpty {
            println("mismatch... [\(line)]")
            continue
        }

        let firstChar = first(line)!
        let theRest = dropFirst(line)

        switch firstChar {
        case "$":
            if let idx = theRest.indexOfCharacter("=") {
                let key = theRest.substringToIndex(idx)
                let value = theRest.substringFromIndex(idx+1)

                dict[key] = value
            } else {
                println("no = sign")
            }
        case "#":
            let subDict = parse(aStreamReader,theRest)

            // Arrays are value types: the list read from the dictionary is a
            // copy, so it must be stored back after appending or every
            // sibling after the first would be lost.
            if var list = dict[theRest] as? [Dictionary<String,AnyObject>] {
                list.append(subDict)
                dict[theRest] = list
            } else {
                dict[theRest] = [subDict]
            }
        case "/":
            if theRest != parentTagName {
                println("mismatch... [\(theRest)] != [\(parentTagName)]")
            } else {
                return dict
            }
        default:
            println("mismatch... [\(line)]")
        }
    }

    // The stream ended before the closing tag of this object was seen.
    println("shouldn't be here...")
    return dict
}


// Driver: reads the root tag from the first line, parses the rest of the
// file recursively and prints the result as JSON.
var data : Dictionary<String,AnyObject>?

if let aStreamReader = StreamReader(path: "/Users/taoufik/Desktop/QuickParser/QuickParser/file.txf") {

    if let line = aStreamReader.nextLine() {
        // Skip the marker character of the first line; an empty first line
        // has nothing to skip and would trap in advance().
        if !line.isEmpty {
            let tagName = line.substringFromIndex(advance(line.startIndex, 1))
            data = parse(aStreamReader, tagName)
        }
    }

    aStreamReader.close()
}

// `data` stays nil when the file could not be opened or was empty, so it
// must not be force-unwrapped.
if let parsed = data {
    println(JSON(parsed))
} else {
    println("nothing parsed: the file could not be opened or was empty")
}

And the StreamReader was borrowed from https://stackoverflow.com/a/24648951/95976

Edit

Edit 2

I rewrote the above in C++11 and got it to run in less than 0.05 seconds (release mode) on a 2012 MacBook Air (i5) using the updated file on Dropbox. I suspect NSDictionary and NSArray must have some penalty. The code below can be compiled into an Objective-C project (the file needs to have the extension .mm):

#include <chrono>
#include <fstream>
#include <iostream>
#include <map>
#include <sstream>
#include <string>
#include <vector>

using namespace std;


/// Minimal stopwatch built on std::chrono::high_resolution_clock.
///
/// When constructed with `startCounting == false` the start point is left
/// default-constructed until reset() is called.
class benchmark {

private:
    using clock = std::chrono::high_resolution_clock;
    using milliseconds = std::chrono::milliseconds;

    clock::time_point start;

public:
    benchmark(bool startCounting = true) {
        if (startCounting) {
            reset();
        }
    }

    /// Restarts the measurement from the current instant.
    void reset() {
        start = clock::now();
    }

    /// Returns the time since the start point in seconds, truncated to
    /// whole milliseconds.
    double elapsed() {
        const auto sinceStart = clock::now() - start;
        const auto wholeMs = std::chrono::duration_cast<milliseconds>(sinceStart).count();
        return wholeMs / 1000.0;
    }
};

/// One parsed object: its "$key=value" properties plus, per tag name, the
/// list of nested "#tag ... /tag" objects in file order.
struct obj {
    std::map<std::string, std::string> properties;
    std::map<std::string, std::vector<obj>> subObjects;
};

/// Recursively parses the body of one object from `stream`.
///
/// Lines are dispatched on their first character:
///   '$'  "key=value" property (split at the first '=')
///   '#'  start of a nested object; parsed recursively
///   '/'  end of the current object; the tag must equal `parentTagName`
///
/// @param stream         input positioned just after the opening "#tag" line
/// @param parentTagName  name of the tag whose body is being parsed
/// @return the object, once its closing "/parentTagName" line is read
/// @throws std::string describing the problem when a line is empty, starts
///         with an unknown marker, lacks '=' in a property, closes the wrong
///         tag, or when the stream ends before the closing tag.
obj parse(std::ifstream& stream, std::string& parentTagName) {
    obj result;
    std::string line;
    while (std::getline(stream, line))
    {
        // An empty line has no marker; substr(1) would throw std::out_of_range,
        // which callers catching std::string would not handle.
        if (line.empty()) {
            throw std::string("mismatch line ") + line;
        }

        const char marker = line[0];
        std::string rest = line.substr(1);

        switch (marker) {
            case '$': {
                const std::string::size_type idx = rest.find('=');
                if (idx == std::string::npos) {
                    throw std::string("no = sign: ") + line;
                }
                result.properties[rest.substr(0, idx)] = rest.substr(idx + 1);
                break;
            }
            case '#': {
                obj child = parse(stream, rest);
                result.subObjects[rest].push_back(child);
                break;
            }
            case '/': {
                if (rest != parentTagName) {
                    throw std::string("mismatch end of object ") + rest + " != " + parentTagName;
                }
                return result;
            }
            default: {
                throw std::string("mismatch line ") + line;
            }
        }
    }

    // Thrown as std::string (not a string literal) so that it is caught by
    // the same handler as every other parse error.
    throw std::string("I don't know why I'm here. Probably because the file is missing an end of object marker");
}


/// Prints `obj` to standard output as a tab-indented tree: first every
/// "key = value" property, then each nested object as a "tag: " heading
/// followed by its own contents one level deeper.
///
/// @param obj     object to print
/// @param indent  number of tab characters placed before each line
void visualise(obj& obj, int indent = 0) {
    for (auto entry = obj.properties.begin(); entry != obj.properties.end(); ++entry) {
        std::cout << std::string(indent, '\t') << entry->first << " = " << entry->second << std::endl;
    }

    for (auto group = obj.subObjects.begin(); group != obj.subObjects.end(); ++group) {
        auto& children = group->second;
        for (auto child = children.begin(); child != children.end(); ++child) {
            // The heading is repeated for every child that shares the tag.
            std::cout << std::string(indent, '\t') << group->first << ": " << std::endl;
            visualise(*child, indent + 1);
        }
    }
}

/// Parses the sample TXF file, reports how long parsing took and prints the
/// resulting tree.
///
/// @return 0 on success, 1 when the file cannot be opened or is malformed.
int main(int argc, const char * argv[]) {
    const char* path = "/Users/taoufik/Desktop/QuickParser/QuickParser/Members.txf";

    try {
        obj result;

        benchmark b;
        std::ifstream stream(path);
        if (!stream) {
            // Without this check a missing file looks like an empty, valid one.
            throw std::string("cannot open ") + path;
        }

        std::string line;
        if (std::getline(stream, line))
        {
            // The first line is the root tag: one marker character, then the name.
            if (line.empty()) {
                throw std::string("mismatch line ") + line;
            }
            std::string tagName = line.substr(1);
            result = parse(stream, tagName);
        }

        // benchmark::elapsed() returns seconds, not milliseconds.
        std::cout << "elapsed " << b.elapsed() << " s" << std::endl;

        visualise(result);

    } catch (const std::string& s) {
        std::cerr << "error " << s << std::endl;
        return 1;
    }

    return 0;
}

Edit 3

See link for full code C++: https://github.com/tofi9/TxfParser

I did some work on your GitHub source. With the following 2 changes I got an overall improvement of 30%, though the major improvement comes from "Optimisation 1".

Optimisation 1 - based on your data, I came up with the following.

/// Returns the index of the first character in `inString` that equals
/// `identifier`, or -1 when no character matches.
+ (int)locate:(NSString*)inString check:(unichar) identifier
{
    for (int position = 0; position < inString.length; position++) {
        if ([inString characterAtIndex:position] == identifier) {
            return position;
        }
    }

    return -1;
}

/// Splits a "key=value" line at its first '=' and stores the pair in self.dict.
///
/// A line without '=' is stored as a key with an empty value. Lines whose key
/// is empty are ignored.
///
/// @param tag The property line with its leading marker already removed.
- (void)didFindKeyValuePair:(NSString *)tag{
#if 0
    // Reference implementation, kept for comparison with the faster path below.
    NSArray *components = [tag componentsSeparatedByString:@"="];
    NSString *key = [components firstObject];
    NSString *value = [components lastObject];
#else

    int separatorIndex = [TXFParser locate:tag check:'='];

    NSString *key = tag;
    NSString *value = @"";
    // locate:check: returns -1 when there is no '='; passing that to
    // substringToIndex: would raise NSRangeException.
    if (separatorIndex >= 0) {
        key = [tag substringToIndex:separatorIndex];
        value = [tag substringFromIndex:separatorIndex+1];
    }

#endif
    if (key.length) {
        self.dict[key] = value?:@"";
    }
}

Optimisation 2:

/// Walks `txfString` line by line, dispatches each line on its first
/// character ('#', '$' or '/') to the matching handler with that character
/// stripped, and returns self.dict once every line has been visited.
- (id)objectFromString:(NSString *)txfString{
    [txfString enumerateLinesUsingBlock:^(NSString *string, BOOL *stop) {
#if 0
        if ([string hasPrefix:@"#"]) {
            [self didStartParsingTag:[string substringFromIndex:1]];
        }else if([string hasPrefix:@"$"]){
            [self didFindKeyValuePair:[string substringFromIndex:1]];
        }else if([string hasPrefix:@"/"]){
            [self didEndParsingTag:[string substringFromIndex:1]];
        }else{
            //[self didFindBodyValue:string];
        }
#else
        // Comparing a single unichar replaces the hasPrefix: calls above;
        // an empty line yields 0, which matches no marker.
        unichar marker = 0;
        if ([string length] > 0) {
            marker = [string characterAtIndex:0];
        }

        switch (marker) {
            case '#':
                [self didStartParsingTag:[string substringFromIndex:1]];
                break;
            case '$':
                [self didFindKeyValuePair:[string substringFromIndex:1]];
                break;
            case '/':
                [self didEndParsingTag:[string substringFromIndex:1]];
                break;
            default:
                //[self didFindBodyValue:string];
                break;
        }

#endif
    }];

    return self.dict;
}

Hope it helps you.

易学教程内所有资源均来自网络或用户发布的内容,如有违反法律规定的内容欢迎反馈
该文章没有解决你所遇到的问题?点击提问,说说你的问题,让更多的人一起探讨吧!