Page Actions
Wiki Actions
User Actions
Submit This Story

Simple HTML cleaner

Clean up HTML with Python…

The code

#!/usr/bin/env python
# -*- coding: utf-8 -*-
 
import sys
from optparse import OptionParser
import re
import os
 
# One or more consecutive CR/LF characters (any mix of line-ending styles).
newline = re.compile(r'[\r\n]+')
# A line that consists of exactly one tag.  Groups:
#   1 -- '/' when the tag is a closing tag (</x>), else ''
#   2 -- the tag name
#   3 -- '/' when the tag is self-closing (<x/>), else ''
tag = re.compile(r'^\s*<(/?)(\w+).*?(/?)>$', re.DOTALL)
# Shift width: number of spaces of indentation per nesting level.
sw = 2
# Tag names (lower case) that must not change the nesting level.  These are
# the HTML void elements: they never have a closing tag, so treating them as
# openers would push every following line one level too deep.
noindent = (
    'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
    'link', 'meta', 'param', 'source', 'track', 'wbr',
)
 
def clean(filename):
    """Re-indent the HTML in *filename* and write the result to stdout.

    Every tag and every run of text between tags is placed on its own
    line; blank lines are dropped.  Each line is indented by ``sw`` spaces
    per nesting level:

    * an opening tag deepens the level for the lines that follow it;
    * a closing tag is written one level shallower than the previous line;
    * self-closing tags (``<x/>``) and tags whose lower-cased name is in
      ``noindent`` leave the level unchanged;
    * lines the ``tag`` pattern does not match (plain text, ``<!...>``
      declarations) are only padded.

    The level never drops below zero, so surplus closing tags are written
    at column 0.

    filename -- path of the HTML file to read.

    Raises IOError/OSError if the file cannot be opened or read.
    """
    with open(filename, 'rb') as fo:
        data = fo.read()
    if not isinstance(data, str):
        # Python 3: a binary read returns bytes, which cannot be combined
        # with the text patterns used below.  Undecodable bytes become
        # U+FFFD rather than aborting the run.  On Python 2 the data is
        # already ``str`` and is left untouched.
        data = data.decode('utf-8', 'replace')

    # Normalise line endings, then force a line break around every tag.
    data = newline.sub('\n', data)
    data = data.replace('>', '>\n').replace('<', '\n<')
    stripped = (line.strip() for line in data.splitlines())
    lines = [line for line in stripped if line]

    out = []
    ilevel = 0
    for line in lines:
        opens = False
        match = tag.search(line)
        if match is not None:
            close1, tagname, close2 = match.groups()
            if close2 or tagname.lower() in noindent:
                # Self-closing or explicitly exempt: level stays as it is.
                pass
            elif close1:
                # Closing tag: dedent *before* writing the line.
                ilevel = max(ilevel - 1, 0)
            else:
                # Opening tag: indent only the lines that come after it.
                opens = True
        out.append(' ' * (sw * ilevel) + line)
        if opens:
            ilevel += 1

    sys.stdout.write('\n'.join(out) + '\n')
 
 
def main():
    """Command-line entry point: clean every file named on the command line.

    The positional arguments returned by ``OptionParser.parse_args()`` are
    passed to ``clean`` one at a time, in the order given.  No options
    are defined beyond what ``OptionParser`` provides by default.

    Returns 0.
    """
    parser = OptionParser()
    filenames = parser.parse_args()[1]
    for path in filenames:
        clean(path)
    return 0
 
if __name__ == '__main__':
    # Run only when executed as a script; main()'s return value becomes
    # the process exit status.
    status = main()
    sys.exit(status)

Discussion

Enter your comment
 
 
blog/2009/10/simple_html_cleaner.txt · Last modified: 2009/10/01 00:00 (external edit)     Back to top
Recent changes RSS feed Creative Commons License Powered by PHP Driven by DokuWiki