customize BeautifulSoup's prettify by tag

前端 未结 2 2175
梦毁少年i
梦毁少年i 2021-02-13 20:43

I was wondering if it would be possible to make it so that prettify did not create new lines on specific tags.

I would like to make it so that `span` and `a` tags are not split onto separate lines.

相关标签:
2条回答
  • 2021-02-13 21:07

    I'm posting a quick hack until I find a better solution.

    I'm actually using it on my project to avoid breaking textareas and pre tags. Replace ['span', 'a'] with the tags on which you want to prevent indentation.

    # Pretty-print `markup` while keeping <span> and <a> elements exactly as
    # written: each protected element is swapped for a str.format() placeholder,
    # the remaining tree is prettified, and the saved elements are substituted
    # back in afterwards.
    # NOTE: this snippet expects bs4's BeautifulSoup class to already be in scope
    # (it relies on find_all / replace_with).
    markup = """<div><div><span>a</span><span>b</span>
    <a>link</a></div><a>link1</a><a>link2</a></div>"""

    # Double curly brackets to avoid problems with .format()
    stripped_markup = markup.replace('{','{{').replace('}','}}')

    stripped_markup = BeautifulSoup(stripped_markup)

    unformatted_tag_list = []

    for i, tag in enumerate(stripped_markup.find_all(['span', 'a'])):
        # The tag text was taken from the brace-doubled markup, but values
        # substituted by .format() are inserted verbatim (never un-escaped),
        # so undo the doubling here; otherwise '{' inside a protected tag
        # would come out as '{{'.
        unformatted_tag_list.append(str(tag).replace('{{', '{').replace('}}', '}'))
        # Leave a '{unformatted_tag_list[i]}' placeholder where the tag was.
        tag.replace_with('{' + 'unformatted_tag_list[{0}]'.format(i) + '}')

    # prettify() only sees the placeholders; .format() then restores the tags
    # and collapses the doubled braces in the rest of the document.
    pretty_markup = stripped_markup.prettify().format(unformatted_tag_list=unformatted_tag_list)

    # Parenthesised form prints the same on Python 2 and also runs on Python 3.
    print(pretty_markup)
    
    0 讨论(0)
  • 2021-02-13 21:07

    The short answer is no.

    The longer answer is not easily.

    I'm still using bs3, so this hack is for bs3. I'm partway through porting this to bs4.

    It essentially involves subclassing Tag and BeautifulSoup and overloading the prettify (and related) methods.

    Code:

    import sys
    import BeautifulSoup
    
    class Tag(BeautifulSoup.Tag):
        """BeautifulSoup 3 ``Tag`` whose pretty-printer can leave chosen tags inline.

        ``__str__`` and ``renderContents`` take an extra ``pprint_exs``
        argument: a collection of tag names that are exempt from
        pretty-printing. A tag whose name is in ``pprint_exs`` is rendered
        without the newlines and indentation normally placed around its
        opening tag, contents and closing tag.
        """

        def __str__(self, encoding=BeautifulSoup.DEFAULT_OUTPUT_ENCODING,
                prettyPrint=False, indentLevel=0, pprint_exs=()):
            """Returns a string or Unicode representation of this tag and
            its contents. To get Unicode, pass None for encoding.

            NOTE: since Python's HTML parser consumes whitespace, this
            method is not certain to reproduce the whitespace present in
            the original string.

            Arguments:
              encoding    -- output encoding, or None for Unicode.
              prettyPrint -- when true, insert newlines and indentation.
              indentLevel -- indentation level of this tag; the tag is
                             prefixed with ``indentLevel - 1`` spaces.
              pprint_exs  -- names of tags to keep inline (only tested
                             with ``in``, so any container works).
            """

            encodedName = self.toEncoding(self.name, encoding)

            # True when this tag gets the normal one-per-line treatment,
            # False when it is one of the exempt (inline) tags.
            unflatten_here = self.name not in pprint_exs

            attrs = []
            if self.attrs:
                for key, val in self.attrs:
                    fmt = '%s="%s"'
                    if isinstance(val, basestring):
                        if self.containsSubstitutions and '%SOUP-ENCODING%' in val:
                            val = self.substituteEncoding(val, encoding)

                        # The attribute value either:
                        #
                        # * Contains no embedded double quotes or single quotes.
                        #   No problem: we enclose it in double quotes.
                        # * Contains embedded single quotes. No problem:
                        #   double quotes work here too.
                        # * Contains embedded double quotes. No problem:
                        #   we enclose it in single quotes.
                        # * Embeds both single _and_ double quotes. This
                        #   can't happen naturally, but it can happen if
                        #   you modify an attribute value after parsing
                        #   the document. Now we have a bit of a
                        #   problem. We solve it by enclosing the
                        #   attribute in single quotes, and escaping any
                        #   embedded single quotes.
                        if '"' in val:
                            fmt = "%s='%s'"
                            if "'" in val:
                                # "&squot;" is not a defined entity, so use
                                # the numeric character reference for the
                                # apostrophe instead.
                                val = val.replace("'", "&#39;")

                        # Now we're okay w/r/t quotes. But the attribute
                        # value might also contain angle brackets, or
                        # ampersands that aren't part of entities. We need
                        # to escape those to XML entities too.
                        val = self.BARE_AMPERSAND_OR_BRACKET.sub(self._sub_entity, val)

                    attrs.append(fmt % (self.toEncoding(key, encoding),
                                        self.toEncoding(val, encoding)))
            close = ''
            closeTag = ''
            if self.isSelfClosing:
                close = ' /'
            else:
                closeTag = '</%s>' % encodedName

            # Nearest preceding tag in the document and nearest preceding
            # sibling tag; both are compared by name against this tag.
            prev = self.findPrevious(lambda x: isinstance(x, Tag))
            prev_sib = self.findPreviousSibling(lambda x: isinstance(x, Tag))
            # The previous sibling is an exempt tag with a different name.
            ex_break_detected = (self.name != prev_sib.name) if(prev_sib and prev_sib.name in pprint_exs) else False
            # The previous tag in the document has a different name.
            break_detected = (self.name != prev.name) if(prev) else False

            indentTag, indentContents = 0, 0
            # Always bound, so the name exists even when prettyPrint is off.
            space = ''
            if prettyPrint:
                if(break_detected or unflatten_here):
                    indentContents = indentLevel + 1
                indentTag = indentLevel
                space = (' ' * (indentTag-1))
            contents = self.renderContents(encoding, prettyPrint, indentContents, pprint_exs, unflatten_here)
            if self.hidden:
                s = contents
            else:
                s = []
                attributeString = ''
                if attrs:
                    attributeString = ' ' + ' '.join(attrs)
                # An exempt tag following a different exempt sibling starts
                # on a fresh line.
                if prettyPrint and ex_break_detected and not unflatten_here:
                    s.append("\n")
                if prettyPrint and (unflatten_here or break_detected):
                    s.append(space)
                s.append('<%s%s%s>' % (encodedName, attributeString, close))
                if prettyPrint and unflatten_here:
                    s.append("\n")
                s.append(contents)
                if prettyPrint and contents and contents[-1] != "\n" and unflatten_here:
                    s.append("\n")
                if prettyPrint and closeTag and unflatten_here:
                    s.append(space)
                s.append(closeTag)
                if prettyPrint and closeTag and self.nextSibling and unflatten_here:
                    s.append("\n")
                # End the line after an exempt tag when the next sibling is
                # a tag with a different name.
                if prettyPrint and isinstance(self.nextSibling, Tag) and self.nextSibling.name != self.name and not unflatten_here:
                    s.append("\n")

                s = ''.join(s)
            return s

        def renderContents(self, encoding=BeautifulSoup.DEFAULT_OUTPUT_ENCODING,
                           prettyPrint=False, indentLevel=0, pprint_exs=(), unflatten=True):
            """Renders the contents of this tag as a string in the given
            encoding. If encoding is None, returns a Unicode string.

            Arguments:
              encoding    -- output encoding, or None for Unicode.
              prettyPrint -- when true, strip text nodes and (if
                             ``unflatten``) put each on its own line.
              indentLevel -- text is prefixed with ``indentLevel - 1``
                             spaces; also passed on to child tags.
              pprint_exs  -- names of tags to keep inline; passed on to
                             child tags.
              unflatten   -- False when the enclosing tag is exempt, in
                             which case text gets no indentation or newline.
            """
            s=[]
            for c in self:
                text = None
                if isinstance(c, BeautifulSoup.NavigableString):
                    text = c.__str__(encoding)
                elif isinstance(c, Tag):
                    s.append(c.__str__(encoding, prettyPrint, indentLevel, pprint_exs))
                if text and prettyPrint:
                    text = text.strip()
                if text:
                    if prettyPrint and unflatten:
                        s.append(" " * (indentLevel-1))
                    s.append(text)
                    if prettyPrint and unflatten:
                        s.append("\n")
            return ''.join(s)
    # Rebind the module attribute so code that looks up BeautifulSoup.Tag
    # from now on gets the customised class.
    BeautifulSoup.Tag = Tag
    
    # Listing the customised Tag before the library's BeautifulStoneSoup puts
    # Tag's __str__ / renderContents overrides ahead of the library's in the
    # method resolution order.
    class BeautifulStoneSoup(Tag, BeautifulSoup.BeautifulStoneSoup):
        """BeautifulStoneSoup variant that inherits the customised Tag rendering."""
        pass
    # Rebind the module attribute so later lookups of
    # BeautifulSoup.BeautifulStoneSoup resolve to this subclass.
    BeautifulSoup.BeautifulStoneSoup = BeautifulStoneSoup
    
    class PumpkinSoup(BeautifulStoneSoup, BeautifulSoup.BeautifulSoup):
        """Soup whose prettify() leaves the tags named in ``pprint_exs`` inline.

        Accepts one extra keyword argument, ``pprint_exs`` (default: empty
        list); every other argument is forwarded to the parent initialiser.
        """

        def __init__(self, *args, **kwargs):
            # Remove our own option first so it is not forwarded upward.
            exempt_names = kwargs.pop("pprint_exs", [])
            self.pprint_exs = exempt_names
            # NOTE(review): super() is anchored at BeautifulSoup.BeautifulSoup,
            # so the __init__ lookup starts *after* that class in the MRO.
            # Confirm that skipping its initialiser is intentional.
            super(BeautifulSoup.BeautifulSoup, self).__init__(*args, **kwargs)

        def prettify(self, encoding=BeautifulSoup.DEFAULT_OUTPUT_ENCODING):
            """Return the pretty-printed document, honouring ``self.pprint_exs``."""
            return self.__str__(encoding, prettyPrint=True,
                                pprint_exs=self.pprint_exs)
    
    # Demo: parse a small document and pretty-print it with <a> and <span>
    # exempt from the one-tag-per-line layout.
    doc = '''
    <div>
     <div>
    <span>a</span><span>b</span>
      <a>link1</a>
      <a>link2</a>
    <span>c</span>
     </div>
    <a>link3</a><a>link4</a>
    </div>
    '''

    soup = PumpkinSoup(doc, pprint_exs=["a", "span"])
    print(soup.prettify())
    
    0 讨论(0)
提交回复
热议问题