Python: How to update value of key value pair in nested dictionary?

后端未结

关注

 9  977

广开言路 2021-01-21 20:25

i am trying to make an inversed document index, therefore i need to know from all unique words in a collection in which doc they occur and how often.

i have used this an

9条回答

伪装坚强ぢ (楼主)

2021-01-21 21:28

#!/usr/bin/env python
# encoding: utf-8
from os.path import join
from glob import glob as glob_
from collections import defaultdict, Counter
from string import punctuation

WORKDIR  = 'temp/'
FILETYPE = '*.html'
OUTF     = 'doc_{0}'.format

def extract(text, startTag='', endTag=''):
    """Extract text between start tag and end tag

    Start at first char following first occurrence of startTag
      If none, begin at start of text
    End at last char preceding first subsequent occurrence of endTag
      If none, end at end of text
    """
    return text.split(startTag, 1)[-1].split(endTag, 1)[0]    

def main():
    DocWords = defaultdict(dict)

    infnames = glob_(join(WORKDIR, FILETYPE))
    for docId,infname in enumerate(infnames, 1):
        outfname = OUTF(docId)
        with open(infname) as inf:
            text = inf.read().lower()
        words = extract(text).strip(punctuation).split()
        for wd,num in Counter(words).iteritems():
            DocWords[wd][outfname] = num

if __name__ == '__main__':
    main()

0 讨论(0)

查看其它9个回答