Package cherrypy :: Package lib :: Module tidy
[hide private]
[frames | no frames]

Source Code for Module cherrypy.lib.tidy

  1  """Functions to run cherrypy.response through Tidy or NSGML.""" 
  2   
  3  import cgi 
  4  import os 
  5  import StringIO 
  6  import traceback 
  7   
  8  import cherrypy 
  9       
def tidy(temp_dir, tidy_path, strict_xml=False, errors_to_ignore=None,
         indent=False, wrap=False, warnings=True):
    """Run cherrypy.response through Tidy.

    Only responses whose Content-Type is 'text/html' are processed.

    If either 'indent' or 'wrap' are specified and tidy exits with status
    0, then response.body will be set to the output of tidy. Otherwise,
    only errors (including warnings, if warnings is True) will change the
    body: it is replaced by a report listing them next to the numbered
    source of the page.

    Note that we use the standalone Tidy tool rather than the python
    mxTidy module. This is because this module does not seem to be
    stable and it crashes on some HTML pages (which means that the
    server would also crash)

    temp_dir: directory where 'page.html', 'tidy.out' and 'tidy.err'
        are written.
    tidy_path: path of the tidy executable.
    strict_xml: if true, pass -xml to tidy and, when tidy reports
        nothing, additionally parse the body as XML with elementtree.
    errors_to_ignore: iterable of substrings; a tidy message containing
        any of them is not reported.
    indent: if true, pass -indent to tidy.
    wrap: False to leave wrapping alone, otherwise the column number
        handed to tidy as '-wrap N'. A value that cannot be converted
        to an int is ignored.
    warnings: if true, lines containing 'Warning' are reported as well
        as lines containing 'Error'.
    """
    response = cherrypy.response

    # The tidy tool, by its very nature, is not generator friendly,
    # so we just collapse the body and work with it.
    orig_body = response.collapse_body()

    fct = response.headers.get('Content-Type', '')
    ct = fct.split(';')[0].strip()
    if ct != 'text/html':
        return

    # Keep only the charset value itself: stop at the next parameter and
    # drop surrounding blanks/quotes, since the value ends up on the
    # tidy command line.
    encoding = ''
    i = fct.find('charset=')
    if i != -1:
        encoding = fct[i + 8:].split(';')[0].strip().strip('"\'')

    page_file = os.path.join(temp_dir, 'page.html')
    out_file = os.path.join(temp_dir, 'tidy.out')
    err_file = os.path.join(temp_dir, 'tidy.err')
    with open(page_file, 'wb') as f:
        f.write(orig_body)

    # 'utf-8' becomes the command line option '-utf8'.
    tidy_enc = encoding.replace('-', '')
    if tidy_enc:
        tidy_enc = '-' + tidy_enc

    if strict_xml:
        xml_opt = ' -xml'
    else:
        xml_opt = ''

    if indent:
        indent_opt = ' -indent'
    else:
        indent_opt = ''

    wrap_opt = ''
    if wrap is not False:
        try:
            wrap_opt = ' -wrap %d' % int(wrap)
        except (TypeError, ValueError):
            # Not a usable column number: run tidy without -wrap.
            pass

    # NOTE(review): the file paths are interpolated unquoted, so a
    # temp_dir containing blanks will not survive the shell.
    result = os.system('"%s" %s%s%s%s -f %s -o %s %s' %
                       (tidy_path, tidy_enc, xml_opt, indent_opt, wrap_opt,
                        err_file, out_file, page_file))

    # Tidy's output only replaces the body when reformatting was asked
    # for and the command returned 0.
    use_output = bool(indent_opt or wrap_opt) and not result
    output = None
    if use_output:
        with open(out_file, 'rb') as f:
            output = f.read()

    with open(err_file, 'rb') as f:
        err_lines = f.read().splitlines()

    ignorable = errors_to_ignore or []
    new_errs = []
    for err in err_lines:
        reported = ('Error' in err or
                    (warnings and 'Warning' in err))
        if not reported:
            continue
        if any(err_ign in err for err_ign in ignorable):
            continue
        new_errs.append(err)

    new_body = None
    if new_errs:
        # wrong_content() HTML-escapes its header and turns newlines into
        # <br />, so the messages are joined with newlines, not markup.
        new_body = wrong_content('\n'.join(new_errs), orig_body)
    elif strict_xml:
        # The HTML is OK, but is it valid XML?
        # Use elementtree to parse XML
        from elementtree.ElementTree import fromstring

        # These entity references are replaced by plain text before
        # parsing (presumably because the XML parser would reject
        # HTML-only entities such as &nbsp; -- TODO confirm).
        for tag in ('nbsp', 'quot'):
            orig_body = orig_body.replace('&' + tag + ';', tag.upper())

        if encoding:
            enctag = '<?xml version="1.0" encoding="%s"?>' % encoding
            orig_body = enctag + orig_body

        try:
            fromstring(orig_body)
        except Exception:
            # Wrong XML: report the parser's traceback.
            new_body = wrong_content(traceback.format_exc(), orig_body,
                                     "XML")

    if new_body is None and use_output:
        new_body = [output]

    if new_body is not None:
        response.body = new_body
        if "Content-Length" in response.headers:
            # Delete Content-Length header so finalize() recalcs it.
            del response.headers["Content-Length"]
117
def html_space(text):
    """Escape text, replacing space with nbsp and tab with 4 nbsp's.

    '&', '<' and '>' are converted to HTML entities; quote characters
    are left untouched. Returns the converted string.
    """
    try:
        from cgi import escape
    except ImportError:
        # cgi.escape was removed in Python 3.8; html.escape replaces it.
        from html import escape

    # quote=False keeps cgi.escape's default of not touching quotes.
    escaped = escape(text, quote=False)
    # Expand each tab to four spaces first so that it ends up as four
    # &nbsp; entities, as documented.
    return escaped.replace('\t', '    ').replace(' ', '&nbsp;')
121
def html_break(text):
    """Escape text, replacing newline with HTML br element.

    '&', '<' and '>' are converted to HTML entities; quote characters
    are left untouched. Returns the converted string.
    """
    try:
        from cgi import escape
    except ImportError:
        # cgi.escape was removed in Python 3.8; html.escape replaces it.
        from html import escape

    # quote=False keeps cgi.escape's default of not touching quotes.
    return escape(text, quote=False).replace('\n', '<br />')
125
def wrong_content(header, body, content_type="HTML"):
    """Build the HTML report shown in place of an invalid page.

    The report starts with "Wrong <content_type>:" followed by 'header'
    (escaped, with newlines turned into <br />), then lists every line
    of 'body' prefixed with its 1-based, zero-padded line number. All
    parts are separated by <br /> and returned as a single string.
    """
    banner = "Wrong %s:<br />%s<br />" % (content_type, html_break(header))
    numbered = ["%03d - %s" % (index + 1, html_space(source_line))
                for index, source_line in enumerate(body.splitlines())]
    return "<br />".join([banner] + numbered)
131 132
def nsgmls(temp_dir, nsgmls_path, catalog_path, errors_to_ignore=None):
    """Run cherrypy.response through nsgmls.

    Only responses whose Content-Type is 'text/html' are processed.
    <script> elements are cut out of the body, the result is written to
    'page.html' in temp_dir and nsgmls is run on it. If nsgmls writes
    any message that is not ignored, response.body is replaced by a
    report listing the messages next to the numbered source of the
    (script-stripped) page.

    temp_dir: directory where 'page.html' and 'nsgmls.err' are written.
    nsgmls_path: path of the nsgmls executable.
    catalog_path: catalog file handed to nsgmls with -c.
    errors_to_ignore: iterable of substrings; an nsgmls message
        containing any of them is not reported.
    """
    response = cherrypy.response

    # nsgmls reads the page from a file, so it is not generator
    # friendly: we just collect the body and work with it.
    orig_body = response.collapse_body()

    fct = response.headers.get('Content-Type', '')
    ct = fct.split(';')[0].strip()
    if ct != 'text/html':
        return

    # Remove bits of Javascript (nsgmls doesn't seem to handle
    # them correctly (for instance, if <a appears in your
    # Javascript code nsgmls complains about it)
    while True:
        i = orig_body.find('<script')
        if i == -1:
            break
        j = orig_body.find('</script>', i)
        if j == -1:
            break
        # 9 == len('</script>')
        orig_body = orig_body[:i] + orig_body[j + 9:]

    page_file = os.path.join(temp_dir, 'page.html')
    with open(page_file, 'wb') as f:
        f.write(orig_body)

    err_file = os.path.join(temp_dir, 'nsgmls.err')
    command = ('%s -c%s -f%s -s -E10 %s' %
               (nsgmls_path, catalog_path, err_file, page_file))
    # Backslashes anywhere in the command are turned into forward
    # slashes before it is handed to the shell.
    command = command.replace('\\', '/')
    os.system(command)
    with open(err_file, 'rb') as f:
        errs = f.read()

    ignorable = errors_to_ignore or []
    new_errs = [err for err in errs.splitlines()
                if not any(err_ign in err for err_ign in ignorable)]

    if new_errs:
        # wrong_content() HTML-escapes its header and turns newlines into
        # <br />, so the messages are joined with newlines, not markup.
        response.body = wrong_content('\n'.join(new_errs), orig_body)
        if "Content-Length" in response.headers:
            # Delete Content-Length header so finalize() recalcs it.
            del response.headers["Content-Length"]
184