chriswarbo-net: c39c4d201b0239ab5ca93cd32de9a50bef207ff5
#!/usr/bin/env python3
"""Clean up an HTML document so that html-tidy stops complaining about it.

The document is read from stdin, parsed with BeautifulSoup's 'html.parser',
passed through a series of independent clean-up steps, and the result is
printed to stdout.
"""

import re
import sys

from bs4 import BeautifulSoup

# Matches any string containing a pilcrow; compiled once and reused.
PILCROW = re.compile('¶')

# Order matters: <code> elements are removed first, then the containers which
# might have held them and are empty as a result.
EMPTY_TAGS = ['code', 'pre', 'span', 'p']


def msg(s):
    """Write the line `s` to stderr (debugging aid; unused by the steps below)."""
    sys.stderr.write(s + "\n")


def remove_summaries(doc):
    """Remove table summaries from every element with class "summarise".

    We used to give tables summaries, specified by a sibling with class
    "summary". html-tidy now complains about this, so just remove them.

    For each "summary" element, the whole direct child of the "summarise"
    container which holds it is removed, not just the summary itself.
    """
    for container in doc.find_all(class_='summarise'):
        for summary in container.find_all(class_='summary'):
            # Climb from the summary until we hit the container; `highest`
            # ends up as the container's direct child holding the summary.
            highest = summary
            for parent in summary.parents:
                # Identity, not '==': BeautifulSoup's '==' compares markup
                # structurally, which is slower and not what we mean.
                if parent is container:
                    break
                highest = parent
            # If an earlier removal already detached this summary, `highest`
            # is the root of the detached subtree and this is a no-op.
            highest.extract()


def remove_empty_elements(doc):
    """Remove elements in EMPTY_TAGS which have no attributes and no children.

    Empty <code> elements are left over from running commands with PanPipe.
    Tidy complains about them, so we might as well remove them, along with
    any <pre>, <span> and <p> containers which are subsequently empty.
    Elements carrying attributes (e.g. an id used as an anchor) or any
    content, even whitespace, are left alone.
    """
    for tag in EMPTY_TAGS:
        # Search afresh for each tag, so removals made for earlier tags are
        # taken into account.
        for element in doc.find_all(tag):
            if not element.attrs and not element.contents:
                element.extract()


def clean_head(doc):
    """Strip stray '¶' characters and <br> elements out of every <head>.

    When we put multiple lines in a document's 'extra_head' (e.g. script
    tags), Pandoc can put '¶' and '<br />' in the <head>, which Tidy spots.
    """
    for head in doc.find_all('head'):
        for text in head.find_all(string=PILCROW):
            text.replace_with(text.replace('¶', ''))
        for br in head.find_all('br'):
            br.extract()


def move_styles_to_head(doc):
    """Move any <style> element found outside the <head> into the <head>.

    Does nothing when the document has no <head>. If there are several
    <head> elements the last one is the target, matching what the previous
    loop-variable-based code ended up using.
    """
    heads = doc.find_all('head')
    if not heads:
        return
    target = heads[-1]
    for style in doc.find_all('style'):
        if not any(parent is target for parent in style.parents):
            # Appending an element which is already in the tree moves it.
            target.append(style)


def remove_default_ol_type(doc):
    """Drop type="1" from <ol> elements.

    Tidy doesn't like the ol element's type="1" attribute in XHTML5; remove
    it, since that's the default style anyway.
    """
    for ol in doc.find_all('ol'):
        if ol.get('type') == '1':
            del ol['type']


def main():
    """Read HTML from stdin, apply every clean-up step, print the result."""
    doc = BeautifulSoup(sys.stdin.read(), 'html.parser')

    remove_summaries(doc)
    remove_empty_elements(doc)
    clean_head(doc)
    move_styles_to_head(doc)
    remove_default_ol_type(doc)

    print(str(doc))


if __name__ == '__main__':
    main()
Generated by git2html.