1 """The ``lxml.html`` tool set for HTML handling.
2 """
3
4 import threading
5 import re
6 import urlparse
7 import copy
8 from lxml import etree
9 from lxml.html import defs
10 from lxml import cssselect
11 from lxml.html._setmixin import SetMixin
12 try:
13 from UserDict import DictMixin
14 except ImportError:
15
16 from lxml.html._dictmixin import DictMixin
17 import sets
18
# Public names exported by ``from lxml.html import *``.
# Each name is listed exactly once ('open_in_browser' used to appear twice).
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links']
24
# Precompiled XPath: every <a> element carrying a ``rel`` attribute, at or
# below the context node.
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]")

# Precompiled XPath: elements whose whitespace-separated ``class`` attribute
# contains $class_name as a whole token (both sides are padded with spaces
# before the substring test).
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
# Precompiled XPath: elements whose ``id`` attribute equals $id.
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
# Precompiled XPath: the XPath string value of the context node.
_collect_string_content = etree.XPath("string()")
# CSS ``url(...)`` references, case-insensitive; group 1 is the text between
# the parentheses (non-greedy).
_css_url_re = re.compile(r'url\((.*?)\)', re.I)
# CSS ``@import "..."`` statements; group 1 is the double-quoted target.
_css_import_re = re.compile(r'@import "(.*?)"')
# Precompiled XPath: any <label> in the document whose ``for`` equals $id.
_label_xpath = etree.XPath("//label[@for=$id]")
# One run of non-space characters; iterlinks() uses it to walk the entries of
# an <object archive="..."> attribute.
_archive_re = re.compile(r'[^ ]+')
34
36
38 """
39 Returns the base URL, given when the page was parsed.
40
41 Use with ``urlparse.urljoin(el.base_url, href)`` to get
42 absolute URLs.
43 """
44 return self.getroottree().docinfo.URL
45 base_url = property(base_url, doc=base_url.__doc__)
46
52 forms = property(forms, doc=forms.__doc__)
53
55 """
56 Return the <body> element. Can be called from a child element
57 to get the document's body.
58 """
59 return self.xpath('//body')[0]
60 body = property(body, doc=body.__doc__)
61
63 """
64 Returns the <head> element. Can be called from a child
65 element to get the document's head.
66 """
67 return self.xpath('//head')[0]
68 head = property(head, doc=head.__doc__)
69
71 """
72 Get or set any <label> element associated with this element.
73 """
74 id = self.get('id')
75 if not id:
76 return None
77 result = _label_xpath(self, id=id)
78 if not result:
79 return None
80 else:
81 return result[0]
83 id = self.get('id')
84 if not id:
85 raise TypeError(
86 "You cannot set a label for an element (%r) that has no id"
87 % self)
88 if not label.tag == 'label':
89 raise TypeError(
90 "You can only assign label to a label element (not %r)"
91 % label)
92 label.set('for', id)
94 label = self.label
95 if label is not None:
96 del label.attrib['for']
97 label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
98
100 """
101 Removes this element from the tree, including its children and
102 text. The tail text is joined to the previous element or
103 parent.
104 """
105 parent = self.getparent()
106 assert parent is not None
107 if self.tail:
108 previous = self.getprevious()
109 if previous is None:
110 parent.text = (parent.text or '') + self.tail
111 else:
112 previous.tail = (previous.tail or '') + self.tail
113 parent.remove(self)
114
116 """
117 Remove the tag, but not its children or text. The children and text
118 are merged into the parent.
119
120 Example::
121
122 >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
123 >>> h.find('.//b').drop_tag()
124 >>> print tostring(h)
125 <div>Hello World!</div>
126 """
127 parent = self.getparent()
128 assert parent is not None
129 previous = self.getprevious()
130 if self.text and isinstance(self.tag, basestring):
131
132 if previous is None:
133 parent.text = (parent.text or '') + self.text
134 else:
135 previous.tail = (previous.tail or '') + self.text
136 if self.tail:
137 if len(self):
138 last = self[-1]
139 last.tail = (last.tail or '') + self.tail
140 elif previous is None:
141 parent.text = (parent.text or '') + self.tail
142 else:
143 previous.tail = (previous.tail or '') + self.tail
144 index = parent.index(self)
145 parent[index:index+1] = self[:]
146
148 """
149 Find any links like ``<a rel="{rel}">...</a>``; returns a list of elements.
150 """
151 rel = rel.lower()
152 return [el for el in _rel_links_xpath(self)
153 if el.get('rel').lower() == rel]
154
156 """
157 Find any elements with the given class name.
158 """
159 return _class_xpath(self, class_name=class_name)
160
162 """
163 Get the first element in a document with the given id. If none is
164 found, return the default argument if provided or raise KeyError
165 otherwise.
166
167 Note that there can be more than one element with the same id,
168 and this isn't uncommon in HTML documents found in the wild.
169 Browsers return only the first match, and this function does
170 the same.
171 """
172 try:
173
174
175 return _id_xpath(self, id=id)[0]
176 except IndexError:
177 if default:
178 return default[0]
179 else:
180 raise KeyError, id
181
def text_content(self):
    """
    Collect the text of this element together with the text found in
    all of its children, and return it as a single value.
    """
    # _collect_string_content is the precompiled XPath ``string()``.
    content = _collect_string_content(self)
    return content
187
189 """
190 Run the CSS expression on this element and its children,
191 returning a list of the results.
192
193 Equivalent to lxml.cssselect.CSSSelector(expr)(self) -- note
194 that pre-compiling the expression can provide a substantial
195 speedup.
196 """
197 return cssselect.CSSSelector(expr)(self)
198
199
200
201
202
204 """
205 Make all links in the document absolute, given the
206 ``base_url`` for the document (the full URL where the document
207 came from), or if no ``base_url`` is given, then the ``.base_url`` of the document.
208
209 If ``resolve_base_href`` is true, then any ``<base href>``
210 tags in the document are used *and* removed from the document.
211 If it is false then any such tag is ignored.
212 """
213 if base_url is None:
214 base_url = self.base_url
215 if base_url is None:
216 raise TypeError(
217 "No base_url given, and the document has no base_url")
218 if resolve_base_href:
219 self.resolve_base_href()
220 def link_repl(href):
221 return urlparse.urljoin(base_url, href)
222 self.rewrite_links(link_repl)
223
225 """
226 Find any ``<base href>`` tag in the document, and apply its
227 values to all links found in the document. Also remove the
228 tag once it has been applied.
229 """
230 base_href = None
231 basetags = self.xpath('//base[@href]')
232 for b in basetags:
233 base_href = b.get('href')
234 b.drop_tree()
235 if not base_href:
236 return
237 self.make_links_absolute(base_href, resolve_base_href=False)
238
240 """
241 Yield (element, attribute, link, pos), where attribute may be None
242 (indicating the link is in the text). ``pos`` is the position
243 where the link occurs; often 0, but sometimes something else in
244 the case of links in stylesheets or style tags.
245
246 Note: <base href> is *not* taken into account in any way. The
247 link you get is exactly the link in the document.
248 """
249 link_attrs = defs.link_attrs
250 for el in self.getiterator():
251 attribs = el.attrib
252 if el.tag != 'object':
253 for attrib in link_attrs:
254 if attrib in attribs:
255 yield (el, attrib, attribs[attrib], 0)
256 elif el.tag == 'object':
257 codebase = None
258
259
260 if 'codebase' in attribs:
261 codebase = el.get('codebase')
262 yield (el, 'codebase', codebase, 0)
263 for attrib in 'classid', 'data':
264 if attrib in attribs:
265 value = el.get(attrib)
266 if codebase is not None:
267 value = urlparse.urljoin(codebase, value)
268 yield (el, attrib, value, 0)
269 if 'archive' in attribs:
270 for match in _archive_re.finditer(el.get('archive')):
271 value = match.group(0)
272 if codebase is not None:
273 value = urlparse.urljoin(codebase, value)
274 yield (el, 'archive', value, match.start())
275 if el.tag == 'param':
276 valuetype = el.get('valuetype') or ''
277 if valuetype.lower() == 'ref':
278
279
280
281
282
283
284 yield (el, 'value', el.get('value'), 0)
285 if el.tag == 'style' and el.text:
286 for match in _css_url_re.finditer(el.text):
287 yield (el, None, match.group(1), match.start(1))
288 for match in _css_import_re.finditer(el.text):
289 yield (el, None, match.group(1), match.start(1))
290 if 'style' in attribs:
291 for match in _css_url_re.finditer(attribs['style']):
292 yield (el, 'style', match.group(1), match.start(1))
293
def rewrite_links(self, link_repl_func, resolve_base_href=True,
                  base_href=None):
    """
    Rewrite all the links in the document.  For each link
    ``link_repl_func(link)`` will be called, and the return value
    will replace the old link.

    Note that links may not be absolute (unless you first called
    ``make_links_absolute()``), and may be internal (e.g.,
    ``'#anchor'``).  They can also be values like
    ``'mailto:email'`` or ``'javascript:expr'``.

    If you give ``base_href`` then all links passed to
    ``link_repl_func()`` will take that into account.  Otherwise, if
    ``resolve_base_href`` is true, ``self.resolve_base_href()`` is
    called first.

    If the ``link_repl_func`` returns None, the attribute or
    tag text will be removed completely; any further links reported
    for that same attribute or text are then left alone.

    Several links may live in one attribute or in one element's text
    (e.g. multiple ``url(...)`` references in a stylesheet).  The
    positions reported by ``iterlinks()`` refer to the text as it was
    before any rewriting, so they are corrected here for the length
    changes caused by earlier replacements in the same text.
    """
    if base_href is not None:
        self.make_links_absolute(base_href, resolve_base_href=resolve_base_href)
    elif resolve_base_href:
        self.resolve_base_href()
    # (element, attribute) -> list of (original_pos, length_delta) for the
    # replacements already applied there, or None once it has been removed.
    edits = {}
    for el, attrib, link, pos in self.iterlinks():
        new_link = link_repl_func(link)
        if new_link == link:
            continue
        target = (el, attrib)
        history = edits.setdefault(target, [])
        if history is None:
            # The attribute/text was removed by an earlier link; there is
            # nothing left to rewrite (indexing it would fail).
            continue
        if new_link is None:
            # Remove the attribute or the element text entirely.
            if attrib is None:
                el.text = ''
            else:
                del el.attrib[attrib]
            edits[target] = None
            continue
        # Shift the reported position by the net growth of every earlier
        # replacement located before it in the original text.
        start = pos + sum([delta for at, delta in history if at < pos])
        end = start + len(link)
        if attrib is None:
            current = el.text
            el.text = current[:start] + new_link + current[end:]
        else:
            current = el.attrib[attrib]
            el.attrib[attrib] = current[:start] + new_link + current[end:]
        history.append((pos, len(new_link) - len(link)))
340
341
343 """
344 An object that represents a method on an element as a function;
345 the function takes either an element or an HTML string. It
346 returns whatever the function normally returns, or if the function
347 works in-place (and so returns None) it returns a serialized form
348 of the resulting document.
349 """
350 - def __init__(self, name, copy=False, source_class=HtmlMixin):
355 if isinstance(doc, basestring):
356 if 'copy' in kw:
357 raise TypeError(
358 "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
359 return_string = True
360 doc = fromstring(doc, **kw)
361 else:
362 if 'copy' in kw:
363 copy = kw.pop('copy')
364 else:
365 copy = self.copy
366 return_string = False
367 if copy:
368 doc = copy.deepcopy(doc)
369 meth = getattr(doc, self.name)
370 result = meth(*args, **kw)
371
372 if result is None:
373
374 if return_string:
375 return tostring(doc)
376 else:
377 return doc
378 else:
379 return result
380
# Module-level function forms of the element methods of the same name (see
# _MethodFunc: each one takes an element or an HTML string as its first
# argument).  ``copy=True`` is passed for the methods that modify the
# document, requesting that element inputs be deep-copied before the method
# is applied; the pure lookups use ``copy=False``.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
387
390
393
396
399
400
402 """A lookup scheme for HTML Element classes.
403
404 To create a lookup instance with different Element classes, pass a tag
405 name mapping of Element classes in the ``classes`` keyword argument and/or
406 a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
407 The special key '*' denotes a Mixin class that should be mixed into all
408 Element classes.
409 """
410 _default_element_classes = {}
411
def __init__(self, classes=None, mixins=None):
    """
    Build the tag name -> Element class table consulted by ``lookup()``.

    ``classes`` maps tag names (lower-case, as ``lookup()`` lower-cases
    the name it receives) to Element classes.  When it is None, a copy
    of ``_default_element_classes`` is used.  A dict passed in is used
    as given and is updated in place when mixins are applied.

    ``mixins`` associates tag names with Mixin classes.  It may be a
    mapping or an iterable of ``(tag_name, mixin_class)`` pairs.  The
    special name ``'*'`` adds the mixin to every tag that has an entry
    in ``classes``.  For each affected tag a new class is created with
    the mixins placed before the current class (``HtmlElement`` if the
    tag has no entry yet) in the list of bases.
    """
    etree.CustomElementClassLookup.__init__(self)
    if classes is None:
        classes = self._default_element_classes.copy()
    if mixins:
        if hasattr(mixins, 'items'):
            # A real mapping: iterating it directly would yield only the
            # keys, so unpack its (name, mixin) pairs instead.
            mixins = mixins.items()
        mixers = {}
        for name, value in mixins:
            if name == '*':
                for n in classes.keys():
                    mixers.setdefault(n, []).append(value)
            else:
                mixers.setdefault(name, []).append(value)
        for name, mix_bases in mixers.items():
            cur = classes.get(name, HtmlElement)
            # Mixins come first so that their attributes take precedence
            # over those of the element class.
            bases = tuple(mix_bases + [cur])
            classes[name] = type(cur.__name__, bases, {})
    self._element_classes = classes
429
def lookup(self, node_type, document, namespace, name):
    """
    Choose the Python class for a node of the given type.

    Elements are resolved through the tag-name table (the name is
    lower-cased first, and HtmlElement is the fallback).  Comments,
    processing instructions and entities each map to one fixed class.
    For any other node type None is returned.
    """
    if node_type == 'element':
        tag = name.lower()
        return self._element_classes.get(tag, HtmlElement)
    if node_type == 'comment':
        return HtmlComment
    if node_type == 'PI':
        return HtmlProcessingInstruction
    if node_type == 'entity':
        return HtmlEntity
    # Not a node type handled here.
    return None
441
442
443
444
445
452
454 """
455 Parses several HTML elements, returning a list of elements.
456
457 The first item in the list may be a string (though leading
458 whitespace is removed). If no_leading_text is true, then it will
459 be an error if there is leading text, and it will always be a list
460 of only elements.
461
462 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
463 """
464
465 start = html[:20].lstrip().lower()
466 if not start.startswith('<html') and not start.startswith('<!doctype'):
467 html = '<html><body>%s</body></html>' % html
468 doc = document_fromstring(html, base_url=base_url, **kw)
469 assert doc.tag == 'html'
470 bodies = [e for e in doc if e.tag == 'body']
471 assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
472 body = bodies[0]
473 elements = []
474 if no_leading_text and body.text and body.text.strip():
475 raise etree.ParserError(
476 "There is leading text: %r" % body.text)
477 if body.text and body.text.strip():
478 elements.append(body.text)
479 elements.extend(body)
480
481
482 return elements
483
485 """
486 Parses a single HTML element; it is an error if there is more than
487 one element, or if anything but whitespace precedes or follows the
488 element.
489
490 If create_parent is true (or is a tag name) then a parent node
491 will be created to encapsulate the HTML in a single element.
492
493 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
494 """
495 if create_parent:
496 if not isinstance(create_parent, basestring):
497 create_parent = 'div'
498 return fragment_fromstring('<%s>%s</%s>' % (
499 create_parent, html, create_parent), base_url=base_url, **kw)
500 elements = fragments_fromstring(html, no_leading_text=True, base_url=base_url, **kw)
501 if not elements:
502 raise etree.ParserError(
503 "No elements found")
504 if len(elements) > 1:
505 raise etree.ParserError(
506 "Multiple elements found (%s)"
507 % ', '.join([_element_name(e) for e in elements]))
508 el = elements[0]
509 if el.tail and el.tail.strip():
510 raise etree.ParserError(
511 "Element followed by text: %r" % el.tail)
512 el.tail = None
513 return el
514
516 """
517 Parse the html, returning a single element/document.
518
519 This tries to minimally parse the chunk of text, without knowing if it
520 is a fragment or a document.
521
522 base_url will set the document's base_url attribute (and the tree's docinfo.URL)
523 """
524 start = html[:10].lstrip().lower()
525 if start.startswith('<html') or start.startswith('<!doctype'):
526
527 return document_fromstring(html, base_url=base_url, **kw)
528
529 doc = document_fromstring(html, base_url=base_url, **kw)
530 bodies = doc.findall('body')
531 if bodies:
532 body = bodies[0]
533 if len(bodies) > 1:
534
535
536 for other_body in bodies[1:]:
537 if other_body.text:
538 if len(body):
539 body[-1].tail = (body[-1].tail or '') + other_body.text
540 else:
541 body.text = (body.text or '') + other_body.text
542 body.extend(other_body)
543
544
545 other_body.drop_tree()
546 else:
547 body = None
548 heads = doc.findall('head')
549 if heads:
550
551 head = heads[0]
552 if len(heads) > 1:
553 for other_head in heads[1:]:
554 head.extend(other_head)
555
556 other_head.drop_tree()
557 return doc
558 if (len(body) == 1 and (not body.text or not body.text.strip())
559 and (not body[-1].tail or not body[-1].tail.strip())):
560
561
562 return body[0]
563
564
565
566 if _contains_block_level_tag(body):
567 body.tag = 'div'
568 else:
569 body.tag = 'span'
570 return body
571
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.

    The result is a tree rather than an element; call
    ``parse(...).getroot()`` to obtain the document root.

    The ``base_url`` keyword overrides the base URL, which is mostly
    useful when the input is a file-like object.  When no ``parser`` is
    given, the module-level ``html_parser`` is used.  Remaining keyword
    arguments are handed on to ``etree.parse()``.
    """
    effective_parser = parser
    if effective_parser is None:
        effective_parser = html_parser
    tree = etree.parse(filename_or_url, effective_parser,
                       base_url=base_url, **kw)
    return tree
584
586
587
588 for el in el.getiterator():
589 if el.tag in defs.block_tags:
590 return True
591 return False
592
594 if isinstance(el, etree.CommentBase):
595 return 'comment'
596 elif isinstance(el, basestring):
597 return 'string'
598 else:
599 return el.tag
600
601
602
603
604
705
# Register FormElement as the default class for ``form`` tags in
# HtmlElementClassLookup.
HtmlElementClassLookup._default_element_classes['form'] = FormElement
707
740
742 import urllib
743
744 if method == 'GET':
745 if '?' in url:
746 url += '&'
747 else:
748 url += '?'
749 url += urllib.urlencode(values)
750 data = None
751 else:
752 data = urllib.urlencode(values)
753 return urllib.urlopen(url, data)
754
756
760 return self.inputs[item].value
762 self.inputs[item].value = value
764 raise KeyError(
765 "You cannot remove keys from ElementDict")
767 return self.inputs.keys()
769 return item in self.inputs
770
772 return '<%s for form %s>' % (
773 self.__class__.__name__,
774 self.inputs.form._name())
775
840
868
869 -class TextareaElement(InputMixin, HtmlElement):
870 """
871 ``<textarea>`` element. You can get the name with ``.name`` and
872 get/set the value with ``.value``
873 """
874
875 - def _value__get(self):
876 """
877 Get/set the value (which is the contents of this element)
878 """
879 return self.text or ''
880 - def _value__set(self, value):
882 - def _value__del(self):
884 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
885
# Register TextareaElement as the default class for ``textarea`` tags in
# HtmlElementClassLookup.
HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
887
889 """
890 ``<select>`` element. You can get the name with ``.name``.
891
892 ``.value`` will be the value of the selected option, unless this
893 is a multi-select element (``<select multiple>``), in which case
894 it will be a set-like object. In either case ``.value_options``
895 gives the possible values.
896
897 The boolean attribute ``.multiple`` shows if this is a
898 multi-select.
899 """
900
902 """
903 Get/set the value of this select (the selected option).
904
905 If this is a multi-select, this is a set-like object that
906 represents all the selected options.
907 """
908 if self.multiple:
909 return MultipleSelectOptions(self)
910 for el in self.getiterator('option'):
911 if 'selected' in el.attrib:
912 value = el.get('value')
913
914 return value
915 return None
916
918 if self.multiple:
919 if isinstance(value, basestring):
920 raise TypeError(
921 "You must pass in a sequence")
922 self.value.clear()
923 self.value.update(value)
924 return
925 if value is not None:
926 for el in self.getiterator('option'):
927
928 if el.get('value') == value:
929 checked_option = el
930 break
931 else:
932 raise ValueError(
933 "There is no option with the value of %r" % value)
934 for el in self.getiterator('option'):
935 if 'selected' in el.attrib:
936 del el.attrib['selected']
937 if value is not None:
938 checked_option.set('selected', '')
939
941
942 if self.multiple:
943 self.value.clear()
944 else:
945 self.value = None
946
947 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
948
950 """
951 All the possible values this select can have (the ``value``
952 attribute of all the ``<option>`` elements).
953 """
954 return [el.get('value') for el in self.getiterator('option')]
955 value_options = property(value_options, doc=value_options.__doc__)
956
958 """
959 Boolean attribute: is there a ``multiple`` attribute on this element.
960 """
961 return 'multiple' in self.attrib
963 if value:
964 self.set('multiple', '')
965 elif 'multiple' in self.attrib:
966 del self.attrib['multiple']
967 multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)
968
# Register SelectElement as the default class for ``select`` tags in
# HtmlElementClassLookup.
HtmlElementClassLookup._default_element_classes['select'] = SelectElement
970
972 """
973 Represents all the selected options in a ``<select multiple>`` element.
974
975 You can add to this set-like option to select an option, or remove
976 to unselect the option.
977 """
978
981
983 """
984 Iterator of all the ``<option>`` elements.
985 """
986 return self.select.getiterator('option')
987 options = property(options)
988
990 for option in self.options:
991 yield option.get('value')
992
def add(self, item):
    """
    Select the option whose ``value`` attribute equals ``item``.

    Only the first matching option is touched: its ``selected``
    attribute is set to the empty string.  Raises ValueError when no
    option carries that value.
    """
    chosen = None
    for candidate in self.options:
        if candidate.get('value') == item:
            chosen = candidate
            break
    if chosen is None:
        raise ValueError(
            "There is no option with the value %r" % item)
    chosen.set('selected', '')
1001
1003 for option in self.options:
1004 if option.get('value') == item:
1005 if 'selected' in option.attrib:
1006 del option.attrib['selected']
1007 else:
1008 raise ValueError(
1009 "The option %r is not currently selected" % item)
1010 break
1011 else:
1012 raise ValueError(
1013 "There is not option with the value %r" % item)
1014
1016 return '<%s {%s} for select name=%r>' % (
1017 self.__class__.__name__,
1018 ', '.join([repr(v) for v in self]),
1019 self.select.name)
1020
1022 """
1023 This object represents several ``<input type=radio>`` elements
1024 that have the same name.
1025
1026 You can use this like a list, but also use the property
1027 ``.value`` to check/uncheck inputs. Also you can use
1028 ``.value_options`` to get the possible values.
1029 """
1030
1032 """
1033 Get/set the value, which checks the radio with that value (and
1034 unchecks any other value).
1035 """
1036 for el in self:
1037 if 'checked' in el.attrib:
1038 return el.get('value')
1039 return None
1040
1042 if value is not None:
1043 for el in self:
1044 if el.get('value') == value:
1045 checked_option = el
1046 break
1047 else:
1048 raise ValueError(
1049 "There is no radio input with the value %r" % value)
1050 for el in self:
1051 if 'checked' in el.attrib:
1052 del el.attrib['checked']
1053 if value is not None:
1054 checked_option.set('checked', '')
1055
1058
1059 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1060
1062 """
1063 Returns a list of all the possible values.
1064 """
1065 return [el.get('value') for el in self]
1066 value_options = property(value_options, doc=value_options.__doc__)
1067
1069 return '%s(%s)' % (
1070 self.__class__.__name__,
1071 list.__repr__(self))
1072
1074 """
1075 Represents a group of checkboxes (``<input type=checkbox>``) that
1076 have the same name.
1077
1078 In addition to using this like a list, the ``.value`` attribute
1079 returns a set-like object that you can add to or remove from to
1080 check and uncheck checkboxes. You can also use ``.value_options``
1081 to get the possible values.
1082 """
1083
1085 """
1086 Return a set-like object that can be modified to check or
1087 uncheck individual checkboxes according to their value.
1088 """
1089 return CheckboxValues(self)
1091 self.value.clear()
1092 if not hasattr(value, '__iter__'):
1093 raise ValueError(
1094 "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
1095 % (self[0].name, value))
1096 self.value.update(value)
1099 value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)
1100
1102 return '%s(%s)' % (
1103 self.__class__.__name__, list.__repr__(self))
1104
1106
1107 """
1108 Represents the values of the checked checkboxes in a group of
1109 checkboxes with the same name.
1110 """
1111
1114
1116 return iter([
1117 el.get('value')
1118 for el in self.group
1119 if 'checked' in el.attrib])
1120
def add(self, value):
    """
    Check the checkbox whose ``value`` attribute equals ``value``.

    Only the first matching checkbox in the group is touched: its
    ``checked`` attribute is set to the empty string.  Raises KeyError
    when the group has no checkbox with that value.
    """
    matched = None
    for box in self.group:
        if box.get('value') == value:
            matched = box
            break
    if matched is None:
        raise KeyError("No checkbox with value %r" % value)
    matched.set('checked', '')
1128
1130 for el in self.group:
1131 if el.get('value') == value:
1132 if 'checked' in el.attrib:
1133 del el.attrib['checked']
1134 else:
1135 raise KeyError(
1136 "The checkbox with value %r was already unchecked" % value)
1137 break
1138 else:
1139 raise KeyError(
1140 "No checkbox with value %r" % value)
1141
1143 return '<%s {%s} for checkboxes name=%r>' % (
1144 self.__class__.__name__,
1145 ', '.join([repr(v) for v in self]),
1146 self.group.name)
1147
1231
# Register InputElement as the default class for ``input`` tags in
# HtmlElementClassLookup.
HtmlElementClassLookup._default_element_classes['input'] = InputElement
1233
1235 """
1236 Represents a ``<label>`` element.
1237
1238 Label elements are linked to other elements with their ``for``
1239 attribute. You can access this element with ``label.for_element``.
1240 """
1241
1243 """
1244 Get/set the element this label points to. Return None if it
1245 can't be found.
1246 """
1247 id = self.get('for')
1248 if not id:
1249 return None
1250 return self.body.get_element_by_id(id)
1252 id = other.get('id')
1253 if not id:
1254 raise TypeError(
1255 "Element %r has no id attribute" % other)
1256 self.set('for', id)
1260 for_element = property(_for_element__get, _for_element__set, _for_element__del,
1261 doc=_for_element__get.__doc__)
1262
# Register LabelElement as the default class for ``label`` tags in
# HtmlElementClassLookup.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement
1264
1265
1266
1267
1268
1269
1270
# Bound ``sub`` method of a regex matching a
# ``<meta http-equiv="Content-Type" ...>`` tag, up to the first ``>`` on the
# same line (``.`` does not match newlines here).  tostring() calls it with
# an empty replacement to strip that tag from serialized output.
__replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type".*?>').sub
1273
def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    Note: unless include_meta_content_type is true, any
    ``<meta http-equiv="Content-Type" ...>`` tag is removed from the
    serialized HTML.  If it is true, the serializer output is returned
    unchanged.  The removal is only applied for ``method='html'``, so
    that text or XML output is never altered by it.

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    ``pretty_print``, ``encoding`` and ``method`` are passed through to
    ``etree.tostring()``.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        '<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        '<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        '<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    html = etree.tostring(doc, method=method, pretty_print=pretty_print,
                          encoding=encoding)
    # Only HTML output is post-processed: running the regex over 'text'
    # output could delete literal text that merely looks like a meta tag.
    if method == 'html' and not include_meta_content_type:
        html = __replace_meta_content_type('', html)
    return html
1317
1319 """
1320 Open the HTML document in a web browser (saving it to a temporary
1321 file to open it).
1322 """
1323 import os
1324 import webbrowser
1325 try:
1326 write_doc = doc.write
1327 except AttributeError:
1328 write_doc = etree.ElementTree(element=doc).write
1329 fn = os.tempnam() + '.html'
1330 write_doc(fn, method="html")
1331 url = 'file://' + fn.replace(os.path.sep, '/')
1332 print url
1333 webbrowser.open(url)
1334
1335
1336
1337
1338
1343
1347
# Module-level default parser instance; parse() falls back to it when it is
# called without an explicit ``parser``.
html_parser = HTMLParser()
1349