簡體   English   中英

如何使用 Python 將 HTML 文本一分為二?

[英]How do I split HTML text in two using Python?

我想將較長的 HTML 文本段落拆分成較短的片段,以便每個片段都能放在一張幻燈片上。 這些段落可能是完整的網頁,也可能只是一些帶有 HTML 格式標籤的文本。 我想保留 HTML 結構,使輸出的片段保留原有的格式。 這是一個更大的 Python 項目的一部分。

注意:我已經找到了解決方案,我將在下面發布。 我只是想與 Stack Overflow 社區分享它,希望它對其他人有用。

這是我的解決方案:

from bs4 import BeautifulSoup, Tag, NavigableString
from bs4.element import Comment
import copy
import re


# Tag names whose directly contained strings are treated as not visible:
# strings whose parent tag has one of these names are excluded from the
# visible text and counted with length 0.
INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')

class SplitPea():
    ''' Holds a parsed HTML document (or fragment) together with its visible
        text and the per-node visible-length statistics used for splitting.

        Attributes set by the constructor:
            original_soup          -- the bs4 element being worked on
            original_visible_text  -- result of visible_text(original_soup)
            original_visible_stats -- result of vis_nav_stats(original_soup, 0, 0)
    '''

    def __init__(self, html_or_soup):
        ''' html_or_soup is either markup to be parsed, or an element that
            has already been parsed by BeautifulSoup (a Tag or a
            BeautifulSoup object), which is then used as-is.
        '''
        # BeautifulSoup objects are Tag instances too, so one isinstance
        # check recognises every already-parsed element. Anything else is
        # treated as markup and parsed with the built-in 'html.parser'.
        if not isinstance(html_or_soup, Tag):
            html_or_soup = BeautifulSoup(html_or_soup, 'html.parser')
        self.original_soup = html_or_soup
        self.original_visible_text = visible_text(self.original_soup)
        self.original_visible_stats = vis_nav_stats(self.original_soup, 0, 0)


def clone(el, class_constructor=Tag):
    ''' Returns a deep copy of a bs4 element that is detached from the tree.

        Navigable strings are copied by re-instantiating their own type.
        Tags are rebuilt with class_constructor (only for the element passed
        in; descendants are always cloned with the default constructor) and
        their children are cloned recursively.

        Modified from original by Martijn Pieters @ Stack Overflow
        https://stackoverflow.com/questions/23057631/clone-element-with-beautifulsoup
    '''
    if isinstance(el, NavigableString):
        return type(el)(el)

    duplicate = class_constructor(None, el.builder, el.name, el.namespace,
                                  el.nsprefix)
    # work around bug where there is no builder set
    # https://bugs.launchpad.net/beautifulsoup/+bug/1307471
    duplicate.attrs = dict(el.attrs)
    duplicate.can_be_empty_element = el.can_be_empty_element
    duplicate.hidden = el.hidden
    for node in el.contents:
        duplicate.append(clone(node))
    return duplicate


def len_map(element, level=0):
    ''' Prints a "map" of the tags in the element.

        One line is printed per node: `level` asterisks, a space, and the
        node's visible length. For a tag this is the length of its visible
        text; for a string it is the string's length, or 0 when its parent
        tag is listed in INVISIBLE_ELEMS.
    '''
    prefix = '*' * level
    if hasattr(element, 'contents'):  # a tag
        print(f"{prefix} {len(visible_text(element))}")
        for child in element.children:
            len_map(child, level+1)
    else:  # a navigable string
        # Computed outside the f-string: a replacement field that spans
        # several lines is a syntax error before Python 3.12.
        if element.parent.name not in INVISIBLE_ELEMS:
            shown_len = len(element)
        else:
            shown_len = 0
        print(f"{prefix} {shown_len}")


def visible_text(soup):
    ''' Returns the concatenation of every string under `soup` whose parent
        tag is not listed in INVISIBLE_ELEMS.

        Slightly modified from original by polor beer @ Stack Overflow
        https://stackoverflow.com/questions/1936466/beautifulsoup-grab-visible-webpage-text#1983219
    '''
    pieces = []
    for fragment in soup.strings:
        if fragment.parent.name in INVISIBLE_ELEMS:
            continue
        pieces.append(fragment)
    return ''.join(pieces)


def traverse_tree(element, tag_action, vis_nav_str_action=None,
                  invis_nav_str_action=None, level=0, **kwargs):
    '''Recursively iterates through the bs4 data structure,
        performing actions depending on the type of elements found.

        tag_action(element, level) is called for every tag (anything with a
        `contents` attribute) before its children are visited.
        vis_nav_str_action(element, level, **kwargs) is called for strings
        whose parent tag is not in INVISIBLE_ELEMS; it defaults to
        tag_action. invis_nav_str_action(element, level, **kwargs) is called
        for the remaining strings; it defaults to vis_nav_str_action.
        `level` is the depth of `element`, incremented for each generation.
    '''
    if not vis_nav_str_action:
        vis_nav_str_action = tag_action
    # Must test the *invisible* callback here; testing the visible one
    # again would leave this callback as None and fail when it is called.
    if not invis_nav_str_action:
        invis_nav_str_action = vis_nav_str_action

    if hasattr(element, 'contents'):
        tag_action(element, level)
        for child in element.children:
            traverse_tree(child, tag_action, vis_nav_str_action,
                          invis_nav_str_action, level+1, **kwargs)
    else:
        if element.parent.name not in INVISIBLE_ELEMS:
            vis_nav_str_action(element, level, **kwargs)
        else:
            invis_nav_str_action(element, level, **kwargs)


def report_tag(element, level):
    ''' Prints one report line for a tag: `level` asterisks, the length of
        its visible text, its name in angle brackets, and the visible text.
    '''
    text = visible_text(element)
    # Single f-string so the visible text is interpolated; a non-f
    # continuation literal would print the placeholder verbatim.
    print(f"{'*' * level} {len(text)} <{element.name}> {text}")


def report_vis_nav_tag(element, level):
    ''' Prints one report line for a visible string: `level` asterisks,
        the string's length, and the string itself.
    '''
    stars = '*' * level
    print(f"{stars} {len(element)} {element}")


def report_invis_nav_tag(element, level):
    ''' Prints one report line for an invisible string: `level` asterisks,
        a fixed length of 0, and the string itself.
    '''
    stars = '*' * level
    print(f"{stars} 0 {element}")


def vis_nav_stats(element, level, prior_char):
    ''' Builds a nested dict of statistics that mirrors the bs4 tree.

        Every node is described by a dict with the keys:
            name     -- tag name, or None for a string
            level    -- depth of the node, as passed in
            start    -- prior_char, the running offset at which the node begins
            length   -- visible length: len(visible_text(tag)) for a tag,
                        len(string) for a string, or 0 for a string whose
                        parent tag is in INVISIBLE_ELEMS
            children -- list of child dicts for a tag, None for a string
    '''
    if not hasattr(element, 'contents'):  # a navigable string
        is_visible = element.parent.name not in INVISIBLE_ELEMS
        return {'name': None, 'level': level, 'start': prior_char,
                'length': len(element) if is_visible else 0,
                'children': None}

    # A tag: each child starts where the previous sibling's length ended.
    offset = prior_char
    child_stats = []
    for node in element.children:
        entry = vis_nav_stats(node, level + 1, prior_char=offset)
        child_stats.append(entry)
        offset += entry['length']
    return {'name': element.name, 'level': level, 'start': prior_char,
            'length': len(visible_text(element)), 'children': child_stats}



def _empty_shell(element):
    ''' Returns a childless copy of a tag: same name, namespace, prefix,
        attributes and can_be_empty_element / hidden flags.
    '''
    shell = Tag(None, element.builder, element.name, element.namespace,
                element.nsprefix)
    shell.attrs = dict(element.attrs)
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(shell, attr, getattr(element, attr))
    return shell


def split_element(element, stats, left_len):
    ''' Splits a BeautifulSoup4 element in two. The position of the split
        (left_len) is length of the left output element's visible characters.

        element  -- the tag or navigable string to split
        stats    -- the stats dict for `element` as produced by
                    vis_nav_stats ('start', 'length' and 'children' are read)
        left_len -- number of visible characters that go to the left part

        Returns a (left, right) tuple of new elements; the input is not
        modified. Either member is None when the element falls entirely on
        the other side. Tags listed in INVISIBLE_ELEMS are copied whole into
        both sides. A tag that straddles the split is rebuilt on both sides
        with the same name and attributes, and its children are split
        recursively.
    '''
    start = stats['start']
    end = start + stats['length']

    if not hasattr(element, 'contents'):  # a navigable string
        if start > left_len:  # All in right
            return None, type(element)(element)
        if end <= left_len:  # All in left
            return type(element)(element), None
        # Part in left, part in right: cut relative to this string's start.
        cut = left_len - start
        return type(element)(element[:cut]), type(element)(element[cut:])

    # A tag from here on.
    if element.name in INVISIBLE_ELEMS:  # copied whole into left and right
        return clone(element), clone(element)
    if start > left_len:  # All in right
        return None, clone(element)
    if end <= left_len:  # All in left
        return clone(element), None

    # Part in left, part in right: distribute the children over two shells.
    left = _empty_shell(element)
    right = _empty_shell(element)
    for child_element, child_stats in zip(element.children, stats['children']):
        child_left, child_right = split_element(child_element, child_stats,
                                                left_len)
        # Skips None as well as empty strings produced by a cut at offset 0.
        if child_left:
            left.append(child_left)
        if child_right:
            right.append(child_right)
    return left, right

示例用法:

# Example: split a small HTML document after 40 visible characters.
text = '<html><head><title>This is the title</title></head><body><p>This is the first ' \
    'sentence containing <b>bold</b> text.</p><br/><p>This is the second ' \
    'sentence containing <i>italic</i> text.</p></body></html>'
splitter = SplitPea(text)

# The parsed document and its precomputed per-node visible-length stats.
element = splitter.original_soup
stats = splitter.original_visible_stats
# Number of visible characters that should end up in the left part.
len_left = 40
left, right = split_element(element, stats, len_left)
print(f"left = {left}")
print(f"right = {right}")

產生以下輸出:

left = <html><head><title>This is the title</title></head><body><p>This is the first 
sentence containing <b>bo</b></p></body></html>
right = <html><head><title>This is the title</title></head><body><p><b>ld</b> text.</p>
<br/><p>This is the second sentence containing <i>italic</i> text.</p></body></html>

暫無
暫無

聲明:本站的技術帖子網頁,遵循CC BY-SA 4.0協議,如果您需要轉載,請注明本站網址或者原文地址。任何問題請咨詢:yoyou2525@163.com.

 
粵ICP備18138465號  © 2020-2024 STACKOOM.COM