Simple HTML cleaner
Clean up and re-indent HTML with Python.
The code
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Simple HTML cleaner.

Puts every tag and every run of text on its own line and indents each line
according to how many tags are currently open, then prints the result to
standard output.  This is a purely textual transformation (no real HTML
parsing), so a literal '<' or '>' inside an attribute value or inside
script/pre content is treated like a tag delimiter.
"""
import io
import os
import re
import sys
from optparse import OptionParser

# One or more consecutive line-break characters (collapsed to a single '\n').
newline = re.compile(r'[\r\n]+')
# A line holding exactly one tag.  Groups: leading '/' (closing tag),
# the tag name, trailing '/' (self-closing tag).
tag = re.compile(r'^\s*<(/?)(\w+).*?(/?)>$', re.DOTALL)
# Number of spaces per indentation level.
sw = 2
# Lower-case names of tags that never get a closing tag and therefore must
# not increase the indentation level.
noindent = ('br',)


def _reindent(text):
    """Return *text* with one tag or text run per line, indented by depth."""
    text = newline.sub('\n', text)
    # Force a line break around every tag so each one sits on its own line.
    text = text.replace('>', '>\n').replace('<', '\n<')

    result = []
    level = 0
    for piece in text.split('\n'):
        piece = piece.strip()
        if not piece:
            continue

        depth = level
        found = tag.search(piece)
        # Plain text, comments and doctypes do not match the tag pattern;
        # they are emitted at the current depth without changing it.
        if found is not None:
            closing, tagname, selfclosing = found.groups()
            if selfclosing or tagname.lower() in noindent:
                pass
            elif closing:
                # Dedent before printing the closing tag, but never go
                # below zero on unbalanced markup.
                if level:
                    level -= 1
                depth = level
            else:
                level += 1

        result.append(' ' * (sw * depth) + piece)

    return '\n'.join(result) + '\n'


def clean(filename):
    """Write a re-indented version of the HTML file *filename* to stdout.

    The file is decoded as UTF-8.  It used to be read as raw bytes, which
    made every regex and string operation below fail with TypeError on
    Python 3.

    Raises:
        IOError/OSError: if the file cannot be opened or read.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    with io.open(filename, 'r', encoding='utf-8') as fo:
        data = fo.read()
    sys.stdout.write(_reindent(data))


def main(argv=None):
    """Clean every file named on the command line and return exit status 0.

    *argv* is the list of command-line arguments without the program name;
    when it is None, optparse falls back to ``sys.argv[1:]``.
    """
    op = OptionParser()
    opts, args = op.parse_args(argv)
    for filename in args:
        clean(filename)
    return 0


if __name__ == '__main__':
    sys.exit(main())

Discussion