Get the HTML under a tag using Python's HTMLParser

后端 未结 2 1504
隐瞒了意图╮
隐瞒了意图╮ 2021-01-13 11:59

I want to get the whole HTML under a tag using HTMLParser. Currently I am able to get only the data (text) between the tags; the following is my code:

class LinksParser(HT         


        
相关标签:
2条回答
  • 2021-01-13 12:38

    One could use xml.etree.ElementTree.TreeBuilder to exploit etree API for finding/manipulating the <span> element:

    import sys
    from HTMLParser import HTMLParser
    from xml.etree import cElementTree as etree
    
    class LinksParser(HTMLParser):
        """Replay HTMLParser events into an ElementTree ``TreeBuilder``.

        Each start tag, end tag and text chunk reported by ``HTMLParser`` is
        forwarded to ``self.tb``; ``close()`` returns whatever the builder's
        own ``close()`` returns, so the parsed markup can then be searched
        and serialized with the etree API.
        """

        def __init__(self):
            # Explicit base-class call (instead of super()) keeps this working
            # with the Python 2 ``HTMLParser`` module imported by this snippet.
            HTMLParser.__init__(self)
            self.tb = etree.TreeBuilder()

        def handle_starttag(self, tag, attributes):
            """Open element *tag* with the given ``(name, value)`` pairs."""
            # HTMLParser reports valueless attributes (``<input disabled>``)
            # with a value of None, which ElementTree refuses to serialize.
            # Store them as the empty string instead.
            attrib = dict(
                (name, "" if value is None else value)
                for name, value in attributes
            )
            self.tb.start(tag, attrib)

        def handle_endtag(self, tag):
            """Close the element *tag* in the tree builder."""
            self.tb.end(tag)

        def handle_data(self, data):
            """Append a chunk of text to the element currently being built."""
            self.tb.data(data)

        def close(self):
            """Finish parsing and return the result of the builder's close()."""
            HTMLParser.close(self)
            return self.tb.close()
    
    # Parse everything read from standard input in a single feed() call.
    parser = LinksParser()
    parser.feed(sys.stdin.read())
    # LinksParser.close() returns the result of the TreeBuilder's close(),
    # which is used here as the root of the parsed tree.
    root = parser.close()
    # Look up a <span> descendant whose itemprop attribute is "description".
    # NOTE(review): find() returns None when nothing matches - confirm the
    # input always contains such a span.
    span = root.find(".//span[@itemprop='description']")
    # Serialize the span (including its own start/end tags) to standard output.
    etree.ElementTree(span).write(sys.stdout)
    

    Output

    <span itemprop="description">
    <h1>My First Heading</h1>
    <p>My first <br /><br />paragraph.</p>
    </span>
    

    To print without the parent (root) <span> tag:

    # Element.text is None when the span starts directly with a child element,
    # and write(None) would raise TypeError, so fall back to an empty string.
    sys.stdout.write(span.text or "")
    # Serialize each direct child of the span in document order.
    for child in span:
        sys.stdout.write(etree.tostring(child)) # add encoding="unicode" on Python 3
    
    0 讨论(0)
  • 2021-01-13 12:41

    Here's something that gets the job done based on the test data you provided with minimal changes to your existing code (assuming it's basically doing what you want already). You'd probably want to expand it to deal with self-closing tags in a more robust way:

    from HTMLParser import HTMLParser
    
    class LinksParser(HTMLParser):
        """Collect the inner HTML of ``<span itemprop="description">``.

        After feeding a document, ``self.data`` holds the markup found inside
        the first matching span (and any later matching span), excluding the
        span's own start and end tags.

        Attributes:
            recording: nesting depth of ``<span>`` elements inside the span
                being captured; 0 means nothing is being captured.
            data: the markup captured so far, as a string.
            self_closing_tags: tag names written as ``<tag/>`` and never
                given a separate closing tag.
        """

        def __init__(self):
            # Explicit base-class call (instead of super()) keeps this working
            # with the Python 2 ``HTMLParser`` module imported by this snippet.
            HTMLParser.__init__(self)
            self.recording = 0
            self.data = ''
            self.self_closing_tags = ("br",)

        @staticmethod
        def _format_attributes(attributes):
            """Render ``(name, value)`` pairs as text to place after a tag name."""
            rendered = []
            for name, value in attributes:
                if value is None:
                    # Valueless attribute such as ``disabled``: keep it bare
                    # rather than emitting the literal string "None".
                    rendered.append(" %s" % (name,))
                else:
                    # Re-escape the characters that would otherwise terminate
                    # or corrupt a double-quoted attribute value.
                    escaped = value.replace('&', '&amp;').replace('"', '&quot;')
                    rendered.append(' %s="%s"' % (name, escaped))
            return "".join(rendered)

        def handle_starttag(self, tag, attributes):
            """Start capturing at the target span, or record a nested start tag."""
            if not self.recording:
                # Outside the target span nothing is emitted; only a span
                # carrying itemprop="description" switches recording on.
                if tag == 'span' and any(
                    name == 'itemprop' and value == 'description'
                    for name, value in attributes
                ):
                    self.recording = 1
                return
            if tag == 'span':
                # Track nested spans so that only the matching </span> of the
                # target span stops the capture.
                self.recording += 1
            # Self-closing tags are written out completely here because a bare
            # ``<br>`` never produces a handle_endtag() call.
            closer = "/>" if tag in self.self_closing_tags else ">"
            self.data += "<%s%s%s" % (
                tag, self._format_attributes(attributes), closer)

        def handle_endtag(self, tag):
            """Record a nested end tag, or stop capturing at the target's end."""
            if not self.recording:
                return
            if tag in self.self_closing_tags:
                # Already emitted in full by handle_starttag(); HTMLParser
                # reports ``<br/>`` as a start tag followed by an end tag.
                return
            if tag == 'span':
                self.recording -= 1
                if not self.recording:
                    # This closes the target span itself, which is not part
                    # of its inner HTML.
                    return
            self.data += "</%s>" % (tag,)

        def handle_data(self, data):
            """Append text to the capture while inside the target span."""
            if self.recording:
                self.data += data
    

    Given this as input:

    <span itemprop="description">
    <h1>My First Heading</h1>
    <p>My first <br/><br/>paragraph.</p>
    </span>
    

    the output is:

    <h1>My First Heading</h1>
    <p>My first <br/><br/>paragraph.</p>
    
    0 讨论(0)
提交回复
热议问题