How to store data like Freebase does?

后端 未结 6 405
刺人心
刺人心 2021-01-31 12:49

I admit that this is basically a duplicate question of Use freebase data on local server? but I need more detailed answers than have already been given there

I\

6条回答
  •  走了就别回头了
    2021-01-31 13:13

    And this is the extra code for my other answer. The meat is in edb.py. Run from Python console and follow the examples. Or use the web2py controller and run in your browser.

    Save this as edb.py:

    import MySQLdb
    import sys
    
    # Open one module-wide MySQL connection at import time.
    # NOTE(review): host/user/passwd/db look like placeholders -- replace them
    # with real settings before use.
    connection = MySQLdb.connect (host   = "localhost",
                                  user   = "root",
                                  passwd = "x",
                                  db     = "y")
    # NOTE(review): fetch_one/fetch_all below create their own cursors, so this
    # module-level cursor appears to be unused in this file.
    cursor = connection.cursor()
    
    # Incremented by fetch_one/fetch_all each time a query is executed.
    query_counter = 0
    # When true, fetch_one/fetch_all print every query before running it.
    print_queries = False
    # Appended as a LIMIT clause to every query by fetch_one/fetch_all.
    limit         = 1000
    
    def fetch_one( query ):
        """Run `query` and return the first column of its first row.

        The module-level `limit` is appended as a LIMIT clause, the final
        query is printed when `print_queries` is set, and `query_counter`
        is incremented once the query has executed.

        Returns None when the query produces no row.
        """
        global query_counter
        query = query + ' LIMIT ' + str(limit)
        if print_queries:
            print(query)    # single-argument form prints the same on Python 2 and 3
        cursor = connection.cursor()
        try:
            cursor.execute( query )
            query_counter += 1
            result = cursor.fetchone()
        finally:
            cursor.close()  # previously one cursor was leaked per call
        if result:
            return result[0]
        return None
    
    def fetch_all( query ):
        """Run `query` and return every row the cursor yields.

        The module-level `limit` is appended as a LIMIT clause, the final
        query is printed when `print_queries` is set, and `query_counter`
        is incremented once the query has executed.
        """
        global query_counter
        query = query + ' LIMIT ' + str(limit)
        if print_queries:
            print(query)    # single-argument form prints the same on Python 2 and 3
        cursor = connection.cursor()
        try:
            cursor.execute( query )
            query_counter += 1
            return cursor.fetchall()
        finally:
            cursor.close()  # previously one cursor was leaked per call
    
    def _flatten( list_of_lists ):
        import itertools
        return list(itertools.chain(*list_of_lists))
    
    #Example: e._search_by_name('steve martin')
    def _search_by_name( name, operator = '=' ):
        typed, ranked = {}, []
        if name:
            name = name.strip()
        if not name:
            return ( typed, ranked )
    
        filler = '' if operator == '=' else '%'
        ranks = {}
    
        #to filter meaningful stuff for every mid returned order by the number of types they have
    
        #search for value text if prop. is 
        #select * from ns where value = 'the king' and (property = '/m/01gr' or property = '/m/06b');
    
        name_mid  = _mid( '/type/object/name'   )
        alias_mid = _mid( '/common/topic/alias' )
    
        query = "select ns.source from ns where ns.value %s '%s%s' and ns.property in ('%s', '%s')" % ( operator, name, filler, name_mid, alias_mid )
    
        for i in fetch_all( query ):
            typed[ i[0] ] = _types( i[0] )
    
        import operator
        ranked = [ ( len( typed[i] ), i ) for i in typed ]
        ranked = [ e[1] for e in sorted( ranked, key=operator.itemgetter(0), reverse = True ) ]
    
        return (typed, ranked)
    
    
    #Example: e._children('')               <---will get the top level domains
    #         e._children('/film')          <---get all types from the domain
    #         e._children('/film/film')     <---get all properties for the type
    def _children( parent, expand = False, raw = False ):
        """List the entries of the `types` table whose destination is `parent`.

        raw    -- when true, return the source column of each row instead of
                  a key string.
        expand -- when true (and raw is false), prefix each key with `parent`
                  so the result is a full path; otherwise keys are '/' + value.
        """
        query = "select t.source, t.value from types t where t.destination = '%s'" % (parent)
        rows = fetch_all( query )       # fetched once; the original ran this query twice
        if raw:
            return [ row[0] for row in rows ]
        prefix = parent if expand else ''
        return [ prefix + '/' + row[1] for row in rows ]
    
    #Example: e._parent('/film/film/songs')
    def _parent( child ):                                       # '/people/marriage/to' -> '/people/marriage'
        #if not isinstance( child, str ): return None          # what kind of safety mechanisms do we need here?
        return '/'.join(child.split('/')[:-1])
    
    #Example: e._domains()
    def _domains():
        """Return the children of the empty parent '' (the top level)."""
        top_level = _children( parent='' )
        return top_level
    
    #Example: e._top_level_types()
    def _top_level_types():
        """Return the children of '/type'."""
        type_children = _children( parent='/type' )
        return type_children
    
    #TODO get all primitive types
    
    #Example: e._mid('/type/object')
    #         e._mid('/authority/imdb/name/nm0000188')
    def _mid( key ):
        if key == '':
            return None
        elif key == '/':
            key = '/boot/root_namespace'
        parts = key.split('/')
        if parts[1] == 'm':            #already a mid
            return key
        namespace = '/'.join(parts[:-1])
        key = parts[-1]
        return fetch_one( "select source from types t where t.destination = '%s' and t.value = '%s'" % (namespace, key) )
    
    #Example: e._key('/type')
    def _key( mid ):
        if isinstance( mid, str):
            res = _keys( mid )
            if not res:
                return None
            rt  = [ r for r in res if r.startswith( '/type' ) ]
            if rt:
                return rt[0]
            else:
                return res[0]
        elif isinstance( mid, list ) or isinstance( mid, tuple ):
            res = [ _key( e ) for e in mid ]
            return [ r for r in res if r is not None ]
        else:
            return None
    
    def _keys( mid ):
        """Return 'destination/value' for every `types` row whose source is `mid`."""
        # check for '/type/object/key' as well?
        rows = fetch_all( "select t.destination, t.value from types t where t.source = '%s'" % mid )
        full_keys = []
        for row in rows:
            full_keys.append( row[0] + '/' + row[1] )
        return full_keys
    
    #Example: e._types('/m/0p_47')
    def _types( mid ):
        """Return the destinations linked from `mid` through '/type/object/type'."""
        type_prop = _mid( '/type/object/type' )
        rows = fetch_all( "select l.destination from links l where l.source = '%s' and l.property = '%s'" % (mid, type_prop) )
        destinations = []
        for row in rows:
            destinations.append( row[0] )
        return destinations
    
    #Example: e._props_n('/m/0p_47')   <---Named immediate properties (like name, etc.)
    def _props_n( mid ):
        """Return the distinct properties recorded for `mid` in the `ns` table."""
        #the same property can be set more than once per topic, hence the set
        rows = fetch_all( "select ns.property from ns where ns.source = '%s'" % (mid) )
        distinct = set( row[0] for row in rows )
        return list( distinct )
    
    #Example: e._props_l('/m/0p_47')   <---All remote properties, some are named, some are anonymous
    def _props_l( mid ):
        """Group the outgoing links of `mid` by property.

        Returns a dict mapping each property to the list of destinations it
        points at; the '/type/object/type' property is left out.
        """
        #exclude types, they have tons of instance links
        type_prop = _mid( '/type/object/type' )
        rows = fetch_all( "select l.property, l.destination from links l where l.source = '%s' and property <> '%s'" % (mid, type_prop) )
        grouped = {}
        for row in rows:
            #the same property can be set more than once per topic
            grouped.setdefault( row[0], [] ).append( row[1] )
        return grouped
    
    #Example: e._props_ln('/m/0p_47')  <---All remote named properties
    def _props_ln( mid ):
        """Return the linked properties of `mid` whose first destination is a '/common/topic'."""
        links = _props_l( mid )
        topic_mid = _mid( '/common/topic' )
        #only the first destination of each property is inspected
        return [ prop for prop in links if topic_mid in _types( links[prop][0] ) ]
    
    #Example: e._props_la('/m/0p_47')  <---All remote anonymous properties, these actually belong to the children!
    #instead of has type /common/topic we used to check if it has name
    def _props_la( mid, raw = True ):
        """Collect the properties reachable through anonymous links of `mid`.

        A linked property counts as anonymous when its first destination does
        not have the '/common/topic' type. For each such destination, the
        children of its first type key are gathered via
        _children(..., expand=True, raw=raw).

        Returns one flat list of those children.
        """
        result = []
        ps = _props_l( mid )
        common_topic = _mid( '/common/topic' )
        for p in ps:
            ts = _types( ps[p][0] )
            if common_topic in ts:                                           #it is a common topic, i.e. named
                continue
            t = _key( ts )                                                   #reuse ts instead of querying the types again
            if t and '/type/type' not in t:                                  #FIXME: hack not to go into types, could be done better
                result.append( _children( t[0], expand=True, raw=raw ) )     #get the first, is this correct?
        return _flatten( result )                                            #it is a list of lists
    
    #FIXME: try to get '/film/actor/film' -> '/type/property/expected_type' -> '/film/performance' -> properties/children
    #instead of checking whether something has a name
    
    #Example: e._get_n('/m/0p_47', e._props_n('/m/0p_47')[0])['/lang/en'] <---These come with a namespace
    def _get_n( mid, prop ):
        """Return every `ns` value stored for `mid` under property `prop`."""
        #the same property can be set more than once per topic
        prop_mid = _mid( prop )
        rows = fetch_all( "select ns.value from ns where ns.source = '%s' and ns.property = '%s'" % (mid, prop_mid) )
        values = []
        for row in rows:
            values.append( row[0] )
        return values
    
    #Example: e._get_l('/m/0p_47', list(e._props_l('/m/0p_47'))[0])  <---returns a list of mids corresponding to that prop.
    #         e._name(e._get_l('/m/0p_47', '/film/writer/film'))          
    def _get_l( mid, prop ):
        """Return every destination linked from `mid` through property `prop`."""
        #the same property can be set more than once per topic
        prop_mid = _mid( prop )
        rows = fetch_all( "select l.destination from links l where l.source = '%s' and l.property = '%s'" % (mid, prop_mid) )
        destinations = []
        for row in rows:
            destinations.append( row[0] )
        return destinations
    
    #Example: e._name(e._get_ln('/m/0p_47', e._props_ln('/m/0p_47')[0]))
    def _get_ln( mid, p ):
        """Alias of _get_l, kept so the *_ln family of helpers is complete."""
        destinations = _get_l( mid, p )
        return destinations
    
    #Example: e._name(e._get_la('/m/0p_47', '/film/performance/film'))
    def _get_la( mid, prop ):
        """Follow `prop` on the anonymous nodes linked from `mid`.

        Looks for the first linked property of `mid` whose first destination
        has the type that owns `prop` (the parent of prop's key), then returns
        the flattened _get_l(destination, prop) results for all destinations
        of that property.

        Returns [] when `mid` has no links, when the key of `prop` cannot be
        resolved, or when no property matches. (The original fell off the end
        and returned None in the no-match case, which broke callers that
        iterate the result.)
        """
        ps = _props_l( mid )
        if not ps:
            return []
    
        prop_key = _key( _mid( prop ) )
        if prop_key is None:                        #unknown property: nothing can match
            return []
        owner_type = _mid( _parent( prop_key ) )    #loop invariant, resolved once
    
        for p in ps:
            es = _get_l( mid, p )                   #get the destinations
            if not es:
                continue
            if owner_type in _types( es[0] ):
                #return after the first matching property
                return _flatten( [ _get_l( e, prop ) for e in es ] )
        return []
    
    #How do we determine properties with multiple values vs those with singular (i.e. place of birth)?
    #is this in the ontology?
    #Ans: yes, /type/property/unique
    
    #Example: e._all_names_ln('/m/0p_47')  <---gets all of object's remote named properties
    def _all_names_ln( mid ):
        """Map the key of each named linked property of `mid` to the names of its destinations."""
        names_by_key = {}
        for prop in _props_ln( mid ):
            names = _name( _get_ln( mid, prop ) )
            label = _key( prop )
            names_by_key[ label ] = names
        return names_by_key
    
    #Example: e._all_names_la('/m/0p_47')  <---gets all of object's remote anonymous properties
    def _all_names_la( mid ):
        """Map the key of each anonymous linked property of `mid` to the names of its destinations."""
        #TODO: prevent loops, run e.all_names_la('/m/0p_47')
        names_by_key = {}
        for prop in _props_la( mid ):
            names = _name( _get_la( mid, prop ) )
            label = _key( prop )
            names_by_key[ label ] = names
        return names_by_key
    
    #FIXME: _all_names_la is going into destinations which are types and have a ton of instance links...
    
    
    #Example: e._name('/m/0p_47')   <---the name of a topic
    #
    def _name( mid ):
        if isinstance( mid, str ):
            nm = _mid( '/type/object/name' )
            return _get_n( mid, nm )
        elif isinstance( mid, list ) or isinstance( mid, tuple ) or isinstance( mid, set ):
            return [ _name( e ) for e in mid ]
        else:
            return None
    
    #for internal use only
    def _get_linked( mid ):
        """Return the set of destinations linked from `mid`, leaving out '/type/object/type' links."""
        #exclude types, they have tons of instance links
        type_prop = _mid( '/type/object/type' )
        rows = fetch_all( "select destination from links where source = '%s' and property <> '%s' " % ( mid, type_prop ) )
        return set( row[0] for row in rows )
    
    #for internal use only
    def _get_connections_internal( entity1, target, path, all_paths, depth, max_depth):
        import copy
    
        if depth > max_depth:
            return
    
        if True:
            print
            print str(entity1) + ', ' + str(target)
            print str( path )
            print str( all_paths )
            print depth
    
        path.append( entity1 )
    
        linked1 = _get_linked( entity1 )
    
        if target in linked1 or entity1 == target:
            path.append( target )
            all_paths.append( path )
            #print str( path )
            return
    
        for l1 in linked1:
            if l1 in path:
                continue
            _get_connections_internal( l1, 
                                       target, 
                                       copy.copy( path ),
                                       all_paths,
                                       depth+1, 
                                       max_depth )
    
    #Example: e._name(e._get_connections('/m/0p_47', '/m/0cwtm'))  <---find path in the graph between the two entities    
    def _get_connections( entity1, target ):
        """Return the paths from `entity1` to `target` found by a search up to depth 2."""
        found = []
        _get_connections_internal( entity1, target, path=[], all_paths=found, depth=0, max_depth=2 )
        return found
    
    #for internal use only
    def _get_connections_internal2( entity1, entity2, path1, path2, all_paths, depth, max_depth, level ):
        """Search from both entities at once for paths that meet.

        entity1, entity2 -- the current node on each side.
        path1, path2     -- nodes visited so far on each side; mutated in place.
        all_paths        -- output list; (path1, path2) tuples are appended.
        depth, max_depth -- recursion stops once depth exceeds max_depth.
        level            -- matches are only recorded while level is 0; each
                            recursive call passes level - 1 and negative values
                            are clamped to 0, so a positive level makes the
                            search expand that many steps before accepting.

        Returns None; results are delivered through `all_paths`.
        """
        import copy
    
        if depth > max_depth:
            return
    
        if level < 0: level = 0
    
        path1.append( entity1 )
        path2.append( entity2 )
    
        # Case 1: both sides are on the same node.
        if entity1 == entity2 and level == 0:
            all_paths.append( ( path1, path2 ) ) #no need to append entity1 or entity2 to the paths
            return
    
        # Case 2: entity2 is directly linked from entity1.
        linked1 = _get_linked( entity1 )
        if entity2 in linked1 and entity2 not in path1 and level == 0:
            path1.append( entity2 )
            all_paths.append( ( path1, path2 ) )
            return
    
        # Case 3: entity1 is directly linked from entity2.
        linked2 = _get_linked( entity2 )
        if entity1 in linked2 and entity1 not in path2 and level == 0:
            path2.append( entity1 )
            all_paths.append( ( path1, path2 ) )
            return
    
        # Case 4: nodes linked from both sides that are not yet on either path.
        inters = linked1.intersection( linked2 )
        inters = inters.difference( set( path1 ) )
        inters = inters.difference( set( path2 ) )
    
        if inters and level == 0:
            for e in inters:               #these are many paths, have to clone
                p1 = copy.copy( path1 )
                p1.append( e )
                p2 = copy.copy( path2 )
                p2.append( e )
                all_paths.append( ( p1,p2 ) )
            return
    
        # No match recorded: recurse on every pair of unvisited neighbours,
        # giving each branch its own copies of the two paths.
        for l1 in linked1:
            if l1 in path1 or l1 in path2:
                continue
            for l2 in linked2:
                if l2 in path1 or l2 in path2:
                    continue
                _get_connections_internal2( l1,                 l2, 
                                            copy.copy( path1 ), copy.copy( path2 ),
                                            all_paths,
                                            depth+1, 
                                            max_depth,
                                            level - 1 )
    
    #Example: e._name(e._get_connections2('/m/0p_47', '/m/0cwtm'))          <---returns two meeting paths starting from both entities
    #         e._name(e._get_connections2('/m/0p_47', '/m/0cwtm', level=1))  <---search deeper
    #         e._name(e._get_connections2('/m/0p_47', '/m/0cwtm', level=2))  <---even deeper
    def _get_connections2( entity1, entity2, level = 0 ):
        """Return (path1, path2) pairs of meeting paths that start from the two entities.

        The search runs to a maximum depth of 15; `level` is passed through
        to _get_connections_internal2.
        """
        meeting_paths = []
        _get_connections_internal2( entity1, entity2, [], [], meeting_paths, 0, 15, level )
        return meeting_paths
    

    And here is a sample web2py controller (just copy edb.py in the web2py models directory):

    # -*- coding: utf-8 -*-
    
    def mid_to_url( mid ):
        """Return the third '/'-separated segment of `mid`, e.g. '/m/0p_47' -> '0p_47'."""
        segments = mid.split( '/' )
        return segments[2]
    
    def index():
        """Show the search form and a table of the topics matching request.vars.term.

        Each result row links the topic name to the `result` action and lists
        the keys of the topic's types.
        """
        form = FORM( TABLE( TR( INPUT(_name='term', _value=request.vars.term ) ),
                            TR(INPUT(_type='submit', _value='Search') ) ),
                     _method='get')
    
        typed, ranked = _search_by_name( request.vars.term )
    
        rows = []
    
        for r in ranked:
            keys = []
            for t in typed[r]:
                k = _key( t )
                if k:
                    keys.append( k )
            # NOTE(review): the separator literal was lost when this listing
            # was rendered as HTML; '<br/>' is the presumed original.
            rows.append( TR( TD( A(_name( r ),
                                   _href = URL('result', args = [mid_to_url(r)]))),
                             TD( XML( '<br/>'.join( keys ) ) ) ) )
    
        result = TABLE( *rows )
    
        return { 'form': form, 'result' : result }
    
    def result():
        """Render the breadcrumb path in request.args and the data of its last element.

        request.args alternates topic, property, topic, ...; each entry is the
        part of a mid after '/m/'. When the last element is a property, the
        objects it links to are listed; otherwise the topic's direct, linked
        named and linked anonymous properties are shown.
        """
        path, data = '', ''
        if not request.args:
            return { 'path':path, 'data':data}
    
        path_rows = []
        for ra in range(len(request.args)):
            if ra%2:
                arrow_url = URL( 'static', 'images/blue_arr.png' )
                display_name = _key('/m/'+request.args[ra])     #it's a property
            else:
                arrow_url = URL( 'static', 'images/red_arr.png' )
                display_name = _name('/m/'+request.args[ra])    #it's a topic
            path_rows.append( TD( A( display_name, _href=URL( args = request.args[0:ra+1] ) ) ) )
            path_rows.append( TD( IMG( _src = arrow_url ) ) )
        path = TABLE( *path_rows )
    
        elems = [ '/m/'+a for a in request.args ]
    
        if _mid( '/type/property' ) in _types( elems[-1] ):     #we are rendering a property
            objects = _get_ln( elems[-2], elems[-1] )
            if not objects:             #there should be a better way to see if this is anonymous
                objects = _get_la( elems[-2], elems[-1] ) or [] #_get_la may return None
            data = TABLE( *[ TR( TD( A(_name(o), _href = URL( args = request.args+[mid_to_url(o)])))) for o in objects ] )
        else:                                                   #we are rendering a topic
            direct_props = TABLE(*[TR(TD(_key(p)), TD(', '.join(_get_n( elems[-1], p)))) for p in _props_n( elems[-1] )])
            linked_named_props = TABLE(*[TR(TD(A(_key(p), _href = URL(args = request.args+[mid_to_url(p)])))) for p in _props_ln( elems[-1] ) ] )
            linked_anon_props = TABLE(*[TR(TD(A(_key(p), _href = URL(args = request.args+[mid_to_url(p)])))) for p in _props_la( elems[-1] ) ] )
            data = TABLE( TR( TH( 'Linked named data:'), TH( 'Linked anonymous data:' ), TH( 'Direct data:' ) ),
                          TR( TD( linked_named_props ), TD( linked_anon_props ), TD( direct_props ) ) )
    
        return { 'path': path, 'data':data }

提交回复
热议问题