chriswarbo-net: c39c4d201b0239ab5ca93cd32de9a50bef207ff5

     1: #!/usr/bin/env python3
     2: 
     3: from bs4 import BeautifulSoup
     4: import sys
     5: 
     6: msg = lambda s: sys.stderr.write(s + "\n")
     7: 
     8: data = sys.stdin.read()
     9: doc  = BeautifulSoup(data, 'html.parser')
    10: 
    11: # We used to give tables summaries, specified by a sibling with class "summary"
    12: # html-tidy now complains about this, so just remove them
    13: for summarise in doc.find_all(class_='summarise'):
    14:     for summary in summarise.find_all(class_='summary'):
    15:         highest = summary
    16:         drop    = True
    17:         for parent in summary.parents:
    18:             if drop:
    19:                 if parent == summarise:
    20:                     drop = False
    21:                 else:
    22:                     highest = parent
    23:         highest.extract()
    24: 
    25: # Empty <code> elements are left over from running commands with PanPipe.
    26: # Tidy complains about them, so we might as well remove them.
    27: # Remove <code> elements first, then remove any containers (<span>, <p> and
    28: # <pre> elements) which might have contained those <code> elements and are
    29: # subsequently empty.
    30: for tag in ['code', 'pre', 'span', 'p']:
    31:     for e in doc.find_all(tag):
    32:         e.extract() if repr(e) == '<' + tag + '></' + tag + '>' \
    33:         else None
    34: 
    35: # When we put multiple lines in a document's 'extra_head' (e.g. script tags),
    36: # Pandoc can put '¶' and '<br />' in the <head>, which Tidy spots. So we
    37: # strip them out here.
    38: import re
    39: for header in doc.find_all('head'):
    40:     odd_paras = header.find_all(string = re.compile('¶'))
    41:     for odd_para in odd_paras:
    42:         odd_para.replace_with(odd_para.replace('¶', ''))
    43:     for br in header.find_all('br'):
    44:         br.extract()
    45: 
    46:     # While we're here, move any rogue <style> elements to the header
    47:     for style in doc.find_all('style'):
    48:         in_header = False
    49:         for parent in style.parents:
    50:             if parent == header:
    51:                 in_header = True
    52:         if not in_header:
    53:             header.append(style)
    54: 
    55: # Tidy doesn't like the ol element's type="1" attribute in XHTML5; remove it,
    56: # since that's the default style anyway
    57: for ol in doc.find_all('ol'):
    58:     if 'type' in ol.attrs and ol['type'] == u'1':
    59:         del(ol['type'])
    60: 
    61: print(str(doc))

Generated by git2html.