[英]How do I split HTML text in two using Python?
我想把较长的 HTML 文本段落拆分成较短的段落,以便每一段都能放进一张幻灯片。 这些段落可能是完整的网页,也可能只是一些带有 HTML 格式标签的文本。 我想保留 HTML 结构,使输出的段落保持原有格式。 这是一个更大的 Python 项目的一部分。
注意:我已经找到了解决方案,我将在下面发布。 我只是想与 Stack Overflow 社区分享它,希望它对其他人有用。
这是我的解决方案:
from bs4 import BeautifulSoup, Tag, NavigableString
from bs4.element import Comment
import copy
import re
# Names of tags whose text is treated as not visible: the functions below
# count a string as visible only when its parent tag's name is not in this
# tuple.
INVISIBLE_ELEMS = ('style', 'script', 'head', 'title')
class SplitPea():
    ''' Holds a parsed document together with the visible-text statistics
    needed to split it.

    Attributes set by the constructor:
        original_soup: the bs4 tree (parsed here, or the one passed in).
        original_visible_text: result of visible_text() on that tree.
        original_visible_stats: result of vis_nav_stats() on that tree,
            starting at level 0 and character offset 0.
    '''

    def __init__(self, html_or_soup):
        ''' Accepts either HTML markup or an already-parsed bs4 Tag
        (a BeautifulSoup object is a Tag too).
        '''
        # Decide by type instead of by catching an exception: markup is
        # parsed, an existing tree is used as-is. The parser is chosen with
        # the `features` argument and its name is 'html.parser'.
        if not isinstance(html_or_soup, Tag):
            html_or_soup = BeautifulSoup(html_or_soup, features='html.parser')
        self.original_soup = html_or_soup
        self.original_visible_text = visible_text(self.original_soup)
        self.original_visible_stats = vis_nav_stats(self.original_soup, 0, 0)
def clone(el, class_constructor=Tag):
    ''' Returns a deep copy of a bs4 element that is detached from the
    original tree.

    Strings are copied by re-instantiating their own type. Tags are rebuilt
    with class_constructor and their children are cloned recursively (the
    children always use the default constructor).

    Modified from original by Martijn Pieters @ Stack Overflow
    https://stackoverflow.com/questions/23057631/clone-element-with-beautifulsoup
    '''
    # Leaf case: a string has no children to recurse into.
    if isinstance(el, NavigableString):
        return type(el)(el)

    duplicate = class_constructor(None, el.builder, el.name, el.namespace,
                                  el.nsprefix)
    # work around bug where there is no builder set
    # https://bugs.launchpad.net/beautifulsoup/+bug/1307471
    duplicate.attrs = dict(el.attrs)
    duplicate.can_be_empty_element = el.can_be_empty_element
    duplicate.hidden = el.hidden
    for child in el.contents:
        duplicate.append(clone(child))
    return duplicate
def len_map(element, level=0):
    ''' Prints a "map" of the tags in the element.

    One line is printed per node: `level` asterisks followed by the node's
    visible character count. Tags report len(visible_text(tag)); strings
    report their own length, or 0 when their parent is an invisible element.
    '''
    prefix = '*' * level
    if hasattr(element, 'contents'):  # a tag
        print(f"{prefix} {len(visible_text(element))}")
        for child in element.children:
            len_map(child, level+1)
    else:  # a navigable string
        # Computed outside the f-string: a replacement field that spans
        # several physical lines is a syntax error before Python 3.12.
        shown = len(element) if element.parent.name not in INVISIBLE_ELEMS else 0
        print(f"{prefix} {shown}")
def visible_text(soup):
    ''' Returns the concatenation of every string in `soup` whose parent
    tag is not listed in INVISIBLE_ELEMS.

    Slightly modified from original by polor beer @ Stack Overflow
    https://stackoverflow.com/questions/1936466/beautifulsoup-grab-visible-webpage-text#1983219
    '''
    pieces = []
    for fragment in soup.strings:
        if fragment.parent.name in INVISIBLE_ELEMS:
            continue  # text inside an invisible element is skipped
        pieces.append(fragment)
    return ''.join(pieces)
def traverse_tree(element, tag_action, vis_nav_str_action=None,
                  invis_nav_str_action=None, level=0, **kwargs):
    '''Recursively iterates through the bs4 data structure,
    performing actions depending on the type of elements found.

    element: node to start from; anything with a `contents` attribute is
        treated as a tag, everything else as a string.
    tag_action: called as tag_action(tag, level) for each tag, before its
        children are visited.
    vis_nav_str_action: called as f(string, level, **kwargs) for strings
        whose parent is not in INVISIBLE_ELEMS. Defaults to tag_action.
    invis_nav_str_action: called the same way for strings whose parent is
        in INVISIBLE_ELEMS. Defaults to vis_nav_str_action.
    level: depth of `element`; children are visited with level+1.
    '''
    if vis_nav_str_action is None:
        vis_nav_str_action = tag_action
    # The fallback must test the invisible-string callback itself; testing
    # vis_nav_str_action here would leave it as None and fail on first use.
    if invis_nav_str_action is None:
        invis_nav_str_action = vis_nav_str_action

    if hasattr(element, 'contents'):  # a tag
        tag_action(element, level)
        for child in element.children:
            traverse_tree(child, tag_action, vis_nav_str_action,
                          invis_nav_str_action, level+1, **kwargs)
    elif element.parent.name not in INVISIBLE_ELEMS:  # visible string
        vis_nav_str_action(element, level, **kwargs)
    else:  # string inside an invisible element
        invis_nav_str_action(element, level, **kwargs)
def report_tag(element, level):
    ''' Prints one line for a tag: `level` asterisks, the length of the
    tag's visible text, the tag name in angle brackets, then the visible
    text itself.
    '''
    text = visible_text(element)
    # Both pieces must be f-strings; without the prefix on the second one
    # the placeholder would be printed literally instead of the text.
    print(f"{''.join(level*['*'])} {len(text)} <{element.name}> "
          f"{text}")
def report_vis_nav_tag(element, level):
    ''' Prints one line for a visible string: `level` asterisks, the
    string's length, then the string itself.
    '''
    stars = '*' * level
    print(f"{stars} {len(element)} {element}")
def report_invis_nav_tag(element, level):
    ''' Prints one line for an invisible string: `level` asterisks, a
    length of 0, then the string itself.
    '''
    stars = '*' * level
    print(f"{stars} 0 {element}")
def vis_nav_stats(element, level, prior_char):
    ''' Builds a nested dict of visible-text statistics that mirrors the
    shape of the bs4 tree rooted at `element`.

    Every node is described by a dict with the keys:
        name: the tag name, or None for a string.
        level: depth of the node, as passed in.
        start: number of visible characters that precede the node
            (prior_char).
        length: visible characters inside the node; 0 for a string whose
            parent is in INVISIBLE_ELEMS.
        children: list of child dicts for a tag, None for a string.
    '''
    if not hasattr(element, 'contents'):  # a navigable string
        is_visible = element.parent.name not in INVISIBLE_ELEMS
        return {'name': None, 'level': level, 'start': prior_char,
                'length': len(element) if is_visible else 0,
                'children': None}

    # A tag: each child starts where the previous sibling's visible text
    # ended.
    child_stats = []
    cursor = prior_char
    for child in element.children:
        entry = vis_nav_stats(child, level+1, prior_char=cursor)
        child_stats.append(entry)
        cursor += entry['length']
    return {'name': element.name, 'level': level, 'start': prior_char,
            'length': len(visible_text(element)), 'children': child_stats}
def _split_string(element, stats, left_len):
    ''' Splits a string leaf at the document-wide visible offset left_len. '''
    start = stats['start']
    if start > left_len:  # All in right
        return None, type(element)(element)
    if start + stats['length'] <= left_len:  # All in left
        return type(element)(element), None
    # Part in left, part in right: convert the document-wide offset into an
    # index local to this string.
    cut = left_len - start
    return type(element)(element[:cut]), type(element)(element[cut:])


def _empty_shell(element):
    ''' Returns a childless Tag with the same name and attributes as element. '''
    shell = Tag(None, element.builder, element.name, element.namespace,
                element.nsprefix)
    shell.attrs = dict(element.attrs)
    for attr in ('can_be_empty_element', 'hidden'):
        setattr(shell, attr, getattr(element, attr))
    return shell


def split_element(element, stats, left_len):
    ''' Splits a BeautifulSoup4 element in two. The position of the split
    (left_len) is length of the left output element's visible characters.

    element: the tag or string to split.
    stats: the dict produced by vis_nav_stats for this same element; its
        'start', 'length' and 'children' entries are read.
    left_len: number of visible characters, counted from the start of the
        whole document, that belong in the left piece.

    Returns a (left, right) tuple. Either side is None when the element
    falls entirely on the other side. Invisible elements (INVISIBLE_ELEMS)
    are copied whole into both sides.
    '''
    if not hasattr(element, 'contents'):  # a navigable string
        return _split_string(element, stats, left_len)

    if element.name in INVISIBLE_ELEMS:  # copied whole into left and right
        return clone(element), clone(element)

    # Visible elements get split between left and right
    start = stats['start']
    if start > left_len:  # All in right
        return None, clone(element)
    if start + stats['length'] <= left_len:  # All in left
        return clone(element), None

    # Part in left, part in right: rebuild the tag on both sides and hand
    # each child to whichever side(s) it belongs to.
    left = _empty_shell(element)
    right = _empty_shell(element)
    for child_element, child_stats in zip(element.children, stats['children']):
        child_left, child_right = split_element(child_element, child_stats,
                                                left_len)
        if child_left:
            left.append(child_left)
        if child_right:
            right.append(child_right)
    return left, right
示例用法:
# Sample document: an invisible <head> followed by two formatted paragraphs.
text = (
    '<html><head><title>This is the title</title></head><body><p>This is the first '
    'sentence containing <b>bold</b> text.</p><br/><p>This is the second '
    'sentence containing <i>italic</i> text.</p></body></html>'
)
splitter = SplitPea(text)
element, stats = splitter.original_soup, splitter.original_visible_stats
# Number of visible characters to keep in the left-hand piece.
len_left = 40
left, right = split_element(element, stats, len_left)
print(f"left = {left}")
print(f"right = {right}")
产生的输出:
left = <html><head><title>This is the title</title></head><body><p>This is the first
sentence containing <b>bo</b></p></body></html>
right = <html><head><title>This is the title</title></head><body><p><b>ld</b> text.</p>
<br/><p>This is the second sentence containing <i>italic</i> text.</p></body></html>
声明:本站的技术帖子网页,遵循CC BY-SA 4.0协议,如果您需要转载,请注明本站网址或者原文地址。任何问题请咨询:yoyou2525@163.com.