Package lxml :: Package html
[hide private]
[frames | no frames]

Source Code for Package lxml.html

   1  """The ``lxml.html`` tool set for HTML handling. 
   2  """ 
   3   
   4  import threading 
   5  import re 
   6  try: 
   7      from urlparse import urljoin 
   8  except ImportError: 
   9      # Python 3 
  10      from urllib.parse import urljoin 
  11  import copy 
  12  from lxml import etree 
  13  from lxml.html import defs 
  14  from lxml import cssselect 
  15  from lxml.html._setmixin import SetMixin 
  16  try: 
  17      from UserDict import DictMixin 
  18  except ImportError: 
  19      # DictMixin was introduced in Python 2.4 
  20      from lxml.html._dictmixin import DictMixin 
  21  try: 
  22      set 
  23  except NameError: 
  24      # Python 2.3 
  25      from sets import Set as set 
  26  try: 
  27      bytes = __builtins__["bytes"] 
  28  except (KeyError, NameError): 
  29      # Python < 2.6 
  30      bytes = str 
  31  try: 
  32      unicode = __builtins__["unicode"] 
  33  except (KeyError, NameError): 
  34      # Python 3 
  35      unicode = str 
  36  try: 
  37      basestring = __builtins__["basestring"] 
  38  except (KeyError, NameError): 
  39      # Python 3 
  40      basestring = (str, bytes) 
  41   
42 -def __fix_docstring(s):
43 if not s: 44 return s 45 import sys 46 if sys.version_info[0] >= 3: 47 sub = re.compile(r"^(\s*)u'", re.M).sub 48 else: 49 sub = re.compile(r"^(\s*)b'", re.M).sub 50 return sub(r"\1'", s)
# Public names exported by ``from lxml.html import *``.
# ('open_in_browser' used to be listed twice; each name now appears once.)
__all__ = [
    'document_fromstring', 'fragment_fromstring', 'fragments_fromstring', 'fromstring',
    'tostring', 'Element', 'defs', 'open_in_browser', 'submit_form',
    'find_rel_links', 'find_class', 'make_links_absolute',
    'resolve_base_href', 'iterlinks', 'rewrite_links', 'parse']

XHTML_NAMESPACE = "http://www.w3.org/1999/xhtml"

# Pre-compiled XPath expressions.  Each one matches both the plain HTML
# tag and the same tag in the XHTML namespace (prefix ``x``).
_rel_links_xpath = etree.XPath("descendant-or-self::a[@rel]|descendant-or-self::x:a[@rel]",
                               namespaces={'x':XHTML_NAMESPACE})
_options_xpath = etree.XPath("descendant-or-self::option|descendant-or-self::x:option",
                             namespaces={'x':XHTML_NAMESPACE})
_forms_xpath = etree.XPath("descendant-or-self::form|descendant-or-self::x:form",
                           namespaces={'x':XHTML_NAMESPACE})
#_class_xpath = etree.XPath(r"descendant-or-self::*[regexp:match(@class, concat('\b', $class_name, '\b'))]", {'regexp': 'http://exslt.org/regular-expressions'})
# Matches $class_name as one whitespace-separated token of @class.
_class_xpath = etree.XPath("descendant-or-self::*[@class and contains(concat(' ', normalize-space(@class), ' '), concat(' ', $class_name, ' '))]")
_id_xpath = etree.XPath("descendant-or-self::*[@id=$id]")
_collect_string_content = etree.XPath("string()")
# url(...) with a double-quoted, single-quoted or bare argument.
_css_url_re = re.compile(r'url\(('+'["][^"]*["]|'+"['][^']*[']|"+r'[^)]*)\)', re.I)
_css_import_re = re.compile(r'@import "(.*?)"')
_label_xpath = etree.XPath("//label[@for=$id]|//x:label[@for=$id]",
                           namespaces={'x':XHTML_NAMESPACE})
# One run of non-space characters.
_archive_re = re.compile(r'[^ ]+')
76 -def _unquote_match(s, pos):
77 if s[:1] == '"' and s[-1:] == '"' or s[:1] == "'" and s[-1:] == "'": 78 return s[1:-1], pos+1 79 else: 80 return s,pos
81
def _transform_result(typ, result):
    """Convert ``result`` back into the type the caller passed in.

    For a ``bytes`` subclass the result is serialized with ``tostring``
    as UTF-8; for a ``unicode`` subclass it is serialized to unicode;
    for any other ``typ`` the result is returned untouched.
    """
    if issubclass(typ, bytes):
        encoding = 'utf-8'
    elif issubclass(typ, unicode):
        encoding = unicode
    else:
        return result
    return tostring(result, encoding=encoding)
91
def _nons(tag):
    """Return ``tag`` without an XHTML namespace prefix.

    String tags of the form ``{XHTML_NAMESPACE}name`` are reduced to
    ``name``; every other value (including non-string tags) is returned
    as given.
    """
    if not isinstance(tag, basestring):
        return tag
    ns_end = len(XHTML_NAMESPACE) + 1
    if tag[0] == '{' and tag[1:ns_end] == XHTML_NAMESPACE:
        return tag.split('}')[-1]
    return tag
97
98 -class HtmlMixin(object):
99
# HtmlMixin member (read-only property).
def base_url(self):
    """
    The URL stored in the docinfo of this element's root tree.

    Use with ``urlparse.urljoin(el.base_url, href)`` to get
    absolute URLs.
    """
    tree = self.getroottree()
    return tree.docinfo.URL
base_url = property(base_url, doc=base_url.__doc__)
# HtmlMixin member (read-only property).
def forms(self):
    """
    Return the result of running the forms XPath on this element:
    every form (HTML or XHTML) at or below it.
    """
    found = _forms_xpath(self)
    return found
forms = property(forms, doc=forms.__doc__)
# HtmlMixin member (read-only property).
def body(self):
    """
    Return the <body> element.  Can be called from a child element
    to get the document's body.

    Raises IndexError when the document has no body element.
    """
    matches = self.xpath('//body|//x:body', namespaces={'x':XHTML_NAMESPACE})
    return matches[0]
body = property(body, doc=body.__doc__)
# HtmlMixin member (read-only property).
def head(self):
    """
    Return the <head> element.  Can be called from a child
    element to get the document's head.

    Raises IndexError when the document has no head element.
    """
    matches = self.xpath('//head|//x:head', namespaces={'x':XHTML_NAMESPACE})
    return matches[0]
head = property(head, doc=head.__doc__)
# HtmlMixin member (read/write/delete property ``label``).
def _label__get(self):
    """
    Get or set any <label> element associated with this element.

    The association is made through this element's ``id`` and the
    label's ``for`` attribute; ``None`` is returned when this element
    has no id or no label points at it.
    """
    element_id = self.get('id')
    if not element_id:
        return None
    labels = _label_xpath(self, id=element_id)
    if labels:
        return labels[0]
    return None
def _label__set(self, label):
    # Both preconditions raise TypeError: this element needs an id, and
    # the assigned element must be a <label>.
    element_id = self.get('id')
    if not element_id:
        raise TypeError(
            "You cannot set a label for an element (%r) that has no id"
            % self)
    if _nons(label.tag) != 'label':
        raise TypeError(
            "You can only assign label to a label element (not %r)"
            % label)
    label.set('for', element_id)
def _label__del(self):
    # Detach the current label (if any) by dropping its ``for`` attribute.
    current = self.label
    if current is not None:
        del current.attrib['for']
label = property(_label__get, _label__set, _label__del, doc=_label__get.__doc__)
# HtmlMixin member.
def drop_tree(self):
    """
    Remove this element, with its children and text, from its parent.

    Any tail text is kept: it is appended to the previous sibling's
    tail or, when there is no previous sibling, to the parent's text.
    The element must have a parent.
    """
    parent = self.getparent()
    assert parent is not None
    tail = self.tail
    if tail:
        sibling = self.getprevious()
        if sibling is not None:
            sibling.tail = (sibling.tail or '') + tail
        else:
            parent.text = (parent.text or '') + tail
    parent.remove(self)
177
# HtmlMixin member.
def drop_tag(self):
    """
    Remove the tag, but not its children or text.  The children and text
    are merged into the parent.

    Example::

        >>> h = fragment_fromstring('<div>Hello <b>World!</b></div>')
        >>> h.find('.//b').drop_tag()
        >>> print(tostring(h, encoding=unicode))
        <div>Hello World!</div>
    """
    def _append(node, attr, extra):
        # node.<attr> may be None; treat that as the empty string.
        setattr(node, attr, (getattr(node, attr) or '') + extra)

    parent = self.getparent()
    assert parent is not None
    previous = self.getprevious()
    # Text that used to precede this element's content goes after the
    # previous sibling, or at the start of the parent when there is none.
    if previous is None:
        anchor, slot = parent, 'text'
    else:
        anchor, slot = previous, 'tail'
    if self.text and isinstance(self.tag, basestring):
        # not a Comment, etc.
        _append(anchor, slot, self.text)
    if self.tail:
        if len(self):
            # The tail follows the last child once the children are hoisted.
            _append(self[-1], 'tail', self.tail)
        else:
            _append(anchor, slot, self.tail)
    position = parent.index(self)
    parent[position:position+1] = self[:]
209 217
# HtmlMixin member.
def find_class(self, class_name):
    """
    Return the elements, at or below this one, whose ``class``
    attribute contains ``class_name`` as a whitespace-separated token.
    """
    matches = _class_xpath(self, class_name=class_name)
    return matches
223
# HtmlMixin member.
def get_element_by_id(self, id, *default):
    """
    Return the first element with the given id.

    When nothing matches, the first extra positional argument is
    returned if one was given; otherwise KeyError(id) is raised.

    Documents in the wild often reuse an id; like browsers, only the
    first match is returned.
    """
    try:
        # FIXME: should this check for multiple matches?
        # browsers just return the first one
        first = _id_xpath(self, id=id)[0]
    except IndexError:
        if not default:
            raise KeyError(id)
        return default[0]
    return first
244
# HtmlMixin member.
def text_content(self):
    """
    Return the XPath ``string()`` value of this element: its text
    together with the text of any children.
    """
    content = _collect_string_content(self)
    return content
250
# HtmlMixin member.
def cssselect(self, expr):
    """
    Run the CSS expression on this element and its children,
    returning a list of the results.

    Equivalent to lxml.cssselect.CSSSelector(expr)(self) -- note
    that pre-compiling the expression can provide a substantial
    speedup.
    """
    selector = cssselect.CSSSelector(expr)
    return selector(self)
261 262 ######################################## 263 ## Link functions 264 ######################################## 265 285 self.rewrite_links(link_repl)
286
# HtmlMixin member.
def resolve_base_href(self):
    """
    Apply the document's ``<base href>`` to all links, then drop the tag.

    Every ``<base>`` element carrying an ``href`` is removed from the
    tree; the href of the last one found is the one applied.  Nothing
    else happens when no non-empty href was found.
    """
    href = None
    for base_tag in self.xpath('//base[@href]|//x:base[@href]', namespaces={'x':XHTML_NAMESPACE}):
        href = base_tag.get('href')
        base_tag.drop_tree()
    if href:
        # The base tags are already gone, so don't resolve them again.
        self.make_links_absolute(href, resolve_base_href=False)
301 380 427 428
class _MethodFunc(object):
    """
    An object that represents a method on an element as a function;
    the function takes either an element or an HTML string.  It
    returns whatever the function normally returns, or if the function
    works in-place (and so returns None) it returns a serialized form
    of the resulting document.
    """
    def __init__(self, name, copy=False, source_class=HtmlMixin):
        """
        ``name`` is the method to look up on the element, ``copy`` is
        the default for deep-copying element inputs before calling it,
        and ``source_class`` supplies the docstring for this wrapper.
        """
        self.name = name
        self.copy = copy
        self.__doc__ = getattr(source_class, self.name).__doc__
    def __call__(self, doc, *args, **kw):
        """
        Call the wrapped method on ``doc`` (an element or an HTML string).

        The ``copy`` keyword overrides the default copy behaviour for
        element inputs; passing it with a string input raises TypeError.
        """
        result_type = type(doc)
        if isinstance(doc, basestring):
            if 'copy' in kw:
                raise TypeError(
                    "The keyword 'copy' can only be used with element inputs to %s, not a string input" % self.name)
            doc = fromstring(doc, **kw)
        else:
            # The flag must not be called ``copy``: that would shadow the
            # ``copy`` module inside this function and break deepcopy().
            if 'copy' in kw:
                make_a_copy = kw.pop('copy')
            else:
                make_a_copy = self.copy
            if make_a_copy:
                doc = copy.deepcopy(doc)
        meth = getattr(doc, self.name)
        result = meth(*args, **kw)
        # FIXME: this None test is a bit sloppy
        if result is None:
            # Then return what we got in
            return _transform_result(result_type, doc)
        else:
            return result
# Function-style wrappers around the element methods of the same name.
# ``copy`` sets each wrapper's default for deep-copying an element
# argument before the method is called on it.
find_rel_links = _MethodFunc('find_rel_links', copy=False)
find_class = _MethodFunc('find_class', copy=False)
make_links_absolute = _MethodFunc('make_links_absolute', copy=True)
resolve_base_href = _MethodFunc('resolve_base_href', copy=True)
iterlinks = _MethodFunc('iterlinks', copy=False)
rewrite_links = _MethodFunc('rewrite_links', copy=True)
class HtmlComment(etree.CommentBase, HtmlMixin):
    # Comment node class: etree.CommentBase combined with HtmlMixin.
    pass
473
class HtmlElement(etree.ElementBase, HtmlMixin):
    # Element class: etree.ElementBase combined with HtmlMixin.
    pass
476
class HtmlProcessingInstruction(etree.PIBase, HtmlMixin):
    # Processing-instruction node class: etree.PIBase combined with HtmlMixin.
    pass
479
class HtmlEntity(etree.EntityBase, HtmlMixin):
    # Entity node class: etree.EntityBase combined with HtmlMixin.
    pass
482 483
class HtmlElementClassLookup(etree.CustomElementClassLookup):
    """A lookup scheme for HTML Element classes.

    To create a lookup instance with different Element classes, pass a tag
    name mapping of Element classes in the ``classes`` keyword argument and/or
    a tag name mapping of Mixin classes in the ``mixins`` keyword argument.
    The special key '*' denotes a Mixin class that should be mixed into all
    Element classes.

    ``mixins`` may be a mapping or a sequence of ``(tag name, mixin)``
    pairs; the sequence form allows several mixins for one tag.
    """
    # Shared registry of tag name -> Element class; copied per instance
    # when no ``classes`` argument is given.
    _default_element_classes = {}

    def __init__(self, classes=None, mixins=None):
        etree.CustomElementClassLookup.__init__(self)
        if classes is None:
            classes = self._default_element_classes.copy()
        if mixins:
            # Iterating a mapping yields only its keys, so turn it into
            # (name, mixin) pairs first; pair sequences pass through.
            if hasattr(mixins, 'items'):
                mixins = mixins.items()
            mixers = {}
            for name, value in mixins:
                if name == '*':
                    for n in classes.keys():
                        mixers.setdefault(n, []).append(value)
                else:
                    mixers.setdefault(name, []).append(value)
            for name, mix_bases in mixers.items():
                # Mixins come first in the bases so they take precedence
                # over the element class they are mixed into.
                cur = classes.get(name, HtmlElement)
                bases = tuple(mix_bases + [cur])
                classes[name] = type(cur.__name__, bases, {})
        self._element_classes = classes

    def lookup(self, node_type, document, namespace, name):
        """Return the class to use for a node, or None for the normal lookup."""
        if node_type == 'element':
            return self._element_classes.get(name.lower(), HtmlElement)
        elif node_type == 'comment':
            return HtmlComment
        elif node_type == 'PI':
            return HtmlProcessingInstruction
        elif node_type == 'entity':
            return HtmlEntity
        # Otherwise normal lookup
        return None
524 525 ################################################################################ 526 # parsing 527 ################################################################################ 528
def document_fromstring(html, parser=None, **kw):
    """
    Parse ``html`` with ``etree.fromstring`` and return the result.

    ``parser`` defaults to the module's ``html_parser``; extra keywords
    are passed on to ``etree.fromstring``.  Raises ``etree.ParserError``
    when parsing yields ``None`` (an empty document).
    """
    if parser is None:
        parser = html_parser
    root = etree.fromstring(html, parser, **kw)
    if root is not None:
        return root
    raise etree.ParserError(
        "Document is empty")
537
def fragments_fromstring(html, no_leading_text=False, base_url=None,
                         parser=None, **kw):
    """
    Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string (text that precedes the
    first element; whitespace-only text is dropped).  If no_leading_text
    is true, leading text raises ``etree.ParserError`` and the list
    holds only elements.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser
    # FIXME: check what happens when you give html with a body, head, etc.
    start = html[:20].lstrip().lower()
    if not (start.startswith('<html') or start.startswith('<!doctype')):
        # Not a full document: wrap it so the parser builds a body for us.
        html = '<html><body>%s</body></html>' % html
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    assert _nons(doc.tag) == 'html'
    bodies = [child for child in doc if _nons(child.tag) == 'body']
    assert len(bodies) == 1, ("too many bodies: %r in %r" % (bodies, html))
    body = bodies[0]
    elements = []
    leading = body.text
    if leading and leading.strip():
        if no_leading_text:
            raise etree.ParserError(
                "There is leading text: %r" % leading)
        elements.append(leading)
    elements.extend(body)
    # FIXME: removing the reference to the parent artificial document
    # would be nice
    return elements
571
def fragment_fromstring(html, create_parent=False, base_url=None,
                        parser=None, **kw):
    """
    Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If create_parent is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.  A true value that
    is not a string selects a ``div`` parent.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)
    """
    if parser is None:
        parser = html_parser

    # Leading text is only acceptable when a wrapper element will hold it.
    wrap = bool(create_parent)

    elements = fragments_fromstring(
        html, parser=parser, no_leading_text=not wrap,
        base_url=base_url, **kw)

    if create_parent:
        if not isinstance(create_parent, basestring):
            create_parent = 'div'
        wrapper = Element(create_parent)
        if elements:
            if isinstance(elements[0], basestring):
                wrapper.text = elements[0]
                del elements[0]
            wrapper.extend(elements)
        return wrapper

    if not elements:
        raise etree.ParserError('No elements found')
    if len(elements) > 1:
        names = [_element_name(item) for item in elements]
        raise etree.ParserError(
            "Multiple elements found (%s)"
            % ', '.join(names))
    el = elements[0]
    if el.tail and el.tail.strip():
        raise etree.ParserError(
            "Element followed by text: %r" % el.tail)
    el.tail = None
    return el
617
def fromstring(html, base_url=None, parser=None, **kw):
    """
    Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    base_url will set the document's base_url attribute (and the tree's docinfo.URL)

    The parsed document is returned as-is when the input starts like a
    full document, when the parsed tree has a <head>, or when it has
    neither <head> nor <body>.  Otherwise the result is the body's only
    child (if it stands alone, without surrounding text) or the body
    itself, retagged as ``div`` or ``span``.
    """
    if parser is None:
        parser = html_parser
    start = html[:10].lstrip().lower()
    if start.startswith('<html') or start.startswith('<!doctype'):
        # Looks like a full HTML document
        return document_fromstring(html, parser=parser, base_url=base_url, **kw)
    # otherwise, lets parse it out...
    doc = document_fromstring(html, parser=parser, base_url=base_url, **kw)
    bodies = doc.findall('body')
    if not bodies:
        bodies = doc.findall('{%s}body' % XHTML_NAMESPACE)
    if bodies:
        body = bodies[0]
        if len(bodies) > 1:
            # Somehow there are multiple bodies, which is bad, but just
            # smash them into one body
            for other_body in bodies[1:]:
                if other_body.text:
                    if len(body):
                        body[-1].tail = (body[-1].tail or '') + other_body.text
                    else:
                        body.text = (body.text or '') + other_body.text
                body.extend(other_body)
                # We'll ignore tail
                # I guess we are ignoring attributes too
                other_body.drop_tree()
    else:
        body = None
    heads = doc.findall('head')
    if not heads:
        heads = doc.findall('{%s}head' % XHTML_NAMESPACE)
    if heads:
        # Well, we have some sort of structure, so lets keep it all
        head = heads[0]
        if len(heads) > 1:
            for other_head in heads[1:]:
                head.extend(other_head)
                # We don't care about text or tail in a head
                other_head.drop_tree()
        return doc
    if body is None:
        # Neither head nor body: there is nothing to unwrap, and the
        # len(body) checks below would fail on None.
        return doc
    if (len(body) == 1 and (not body.text or not body.text.strip())
        and (not body[-1].tail or not body[-1].tail.strip())):
        # The body has just one element, so it was probably a single
        # element passed in
        return body[0]
    # Now we have a body which represents a bunch of tags which have the
    # content that was passed in.  We will create a fake container, which
    # is the body tag, except <body> implies too much structure.
    if _contains_block_level_tag(body):
        body.tag = 'div'
    else:
        body.tag = 'span'
    return body
680
def parse(filename_or_url, parser=None, base_url=None, **kw):
    """
    Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    ``parser`` defaults to the module's ``html_parser``.  The
    ``base_url`` keyword overrides the base URL, which is most useful
    when parsing from a file-like object.
    """
    if parser is None:
        parser = html_parser
    tree = etree.parse(filename_or_url, parser, base_url=base_url, **kw)
    return tree
693
def _contains_block_level_tag(el):
    """Return True if ``el`` or any node under it has a tag in ``defs.block_tags``."""
    # FIXME: I could do this with XPath, but would that just be
    # unnecessarily slow?
    block_tags = defs.block_tags
    for node in el.iter():
        if _nons(node.tag) in block_tags:
            return True
    return False
701
def _element_name(el):
    """
    Return a short name for ``el`` for use in error messages:
    ``'comment'`` for comment nodes, ``'string'`` for text, and the
    namespace-free tag name for anything else.
    """
    if isinstance(el, etree.CommentBase):
        return 'comment'
    if isinstance(el, basestring):
        return 'string'
    return _nons(el.tag)
709 710 ################################################################################ 711 # form handling 712 ################################################################################ 713
class FormElement(HtmlElement):
    """
    Represents a <form> element.
    """

    def inputs(self):
        """
        Returns an accessor for all the input elements in the form.

        See `InputGetter` for more information about the object.
        """
        return InputGetter(self)
    inputs = property(inputs, doc=inputs.__doc__)

    def _fields__get(self):
        """
        Dictionary-like object that represents all the fields in this
        form.  You can set values in this dictionary to effect the
        form.
        """
        return FieldsDict(self.inputs)
    def _fields__set(self, value):
        # Assign every key of the mapping ``value``; named fields that
        # exist in the form but are missing from ``value`` are set to None.
        fields = self.fields
        prev_keys = list(fields.keys())
        # ``items()`` exists on both Python 2 and 3 mappings, unlike
        # ``iteritems()``; the loop variable no longer rebinds ``value``.
        for key, field_value in value.items():
            if key in prev_keys:
                prev_keys.remove(key)
            fields[key] = field_value
        for key in prev_keys:
            if key is None:
                # Case of an unnamed input; these aren't really
                # expressed in form_values() anyway.
                continue
            fields[key] = None

    fields = property(_fields__get, _fields__set, doc=_fields__get.__doc__)

    def _name(self):
        # Label used in reprs: the name attribute, else '#' + id, else the
        # form's index among the forms found under the document body.
        if self.get('name'):
            return self.get('name')
        elif self.get('id'):
            return '#' + self.get('id')
        forms = list(self.body.iter('form'))
        if not forms:
            forms = list(self.body.iter('{%s}form' % XHTML_NAMESPACE))
        return str(forms.index(self))

    def form_values(self):
        """
        Return a list of tuples of the field values for the form.
        This is suitable to be passed to ``urllib.urlencode()``.
        """
        results = []
        for el in self.inputs:
            name = el.name
            if not name:
                continue
            tag = _nons(el.tag)
            if tag == 'textarea':
                results.append((name, el.value))
            elif tag == 'select':
                value = el.value
                if el.multiple:
                    for v in value:
                        results.append((name, v))
                elif value is not None:
                    results.append((name, el.value))
            else:
                assert tag == 'input', (
                    "Unexpected tag: %r" % el)
                # Unchecked checkables and button-like inputs are skipped.
                if el.checkable and not el.checked:
                    continue
                if el.type in ('submit', 'image', 'reset'):
                    continue
                value = el.value
                if value is not None:
                    results.append((name, el.value))
        return results

    def _action__get(self):
        """
        Get/set the form's ``action`` attribute.

        When the document has a base URL and an action is present, the
        value read is the action joined onto that base URL.
        """
        base_url = self.base_url
        action = self.get('action')
        if base_url and action is not None:
            return urljoin(base_url, action)
        else:
            return action
    def _action__set(self, value):
        self.set('action', value)
    def _action__del(self):
        if 'action' in self.attrib:
            del self.attrib['action']
    action = property(_action__get, _action__set, _action__del, doc=_action__get.__doc__)

    def _method__get(self):
        """
        Get/set the form's method.  Always returns a capitalized
        string, and defaults to ``'GET'``
        """
        return self.get('method', 'GET').upper()
    def _method__set(self, value):
        self.set('method', value.upper())
    method = property(_method__get, _method__set, doc=_method__get.__doc__)

HtmlElementClassLookup._default_element_classes['form'] = FormElement
def submit_form(form, extra_values=None, open_http=None):
    """
    Helper function to submit a form.  Returns whatever ``open_http``
    returns; with the default opener that is a file-like object, as from
    ``urllib.urlopen()``.  This object also has a ``.geturl()`` function,
    which shows the URL if there were any redirects.

    ``extra_values`` (a mapping or a sequence of ``(name, value)``
    pairs) is appended to the form's own values.  The target URL is the
    form's action, or the form's base URL when it has no action.

    To change the HTTP requester, pass a function as ``open_http`` keyword
    argument that opens the URL for you.  The function must have the following
    signature::

        open_http(method, URL, values)

    The method is the form's method (``'GET'`` by default), the URL is
    the target URL as a string, and the values are a sequence of
    ``(name, value)`` tuples with the form data.
    """
    values = form.form_values()
    if extra_values:
        extras = extra_values
        if hasattr(extras, 'items'):
            extras = extras.items()
        values.extend(extras)
    if open_http is None:
        open_http = open_http_urllib
    if not form.action:
        url = form.base_url
    else:
        url = form.action
    return open_http(form.method, url, values)
857
def open_http_urllib(method, url, values):
    """
    Default opener for ``submit_form``: send ``values`` to ``url``.

    For ``'GET'`` the url-encoded values are appended to the URL's query
    string; for any other method they are sent as the request body.
    Returns the object produced by ``urlopen``.  Raises ValueError when
    ``url`` is empty.
    """
    if not url:
        raise ValueError("cannot submit, no URL provided")
    ## FIXME: should test that it's not a relative URL or something
    try:
        from urllib import urlencode, urlopen
    except ImportError: # Python 3
        from urllib.request import urlopen
        from urllib.parse import urlencode
    if method == 'GET':
        if '?' in url:
            url += '&'
        else:
            url += '?'
        url += urlencode(values)
        data = None
    else:
        data = urlencode(values)
        # Python 3's urlopen() requires bytes for the request body, but
        # urlencode() returns text there.
        if not isinstance(data, bytes):
            data = data.encode('ASCII')
    return urlopen(url, data)
877
class FieldsDict(DictMixin):
    """
    Dictionary-style view of a form's field values.

    Reads and writes go to the ``.value`` of the matching entry in the
    wrapped ``inputs`` accessor; keys cannot be deleted.
    """

    def __init__(self, inputs):
        self.inputs = inputs
    def __getitem__(self, item):
        field = self.inputs[item]
        return field.value
    def __setitem__(self, item, value):
        field = self.inputs[item]
        field.value = value
    def __delitem__(self, item):
        raise KeyError(
            "You cannot remove keys from ElementDict")
    def keys(self):
        return self.inputs.keys()
    def __contains__(self, item):
        return item in self.inputs

    def __repr__(self):
        form_name = self.inputs.form._name()
        return '<%s for form %s>' % (
            self.__class__.__name__,
            form_name)
898
class InputGetter(object):

    """
    An accessor that represents all the input fields in a form.

    You can get fields by name from this, with
    ``form.inputs['field_name']``.  If several radio inputs or several
    checkboxes share that name, they are returned as a group object
    (`RadioGroup` or `CheckboxGroup`) with its ``name`` set.

    You can also iterate over this to get all input elements.  This
    won't return the same thing as if you get all the names, as
    checkboxes and radio elements are returned individually.
    """

    _name_xpath = etree.XPath(".//*[@name = $name and (local-name(.) = 'select' or local-name(.) = 'input' or local-name(.) = 'textarea')]")
    _all_xpath = etree.XPath(".//*[local-name() = 'select' or local-name() = 'input' or local-name() = 'textarea']")

    def __init__(self, form):
        self.form = form

    def __repr__(self):
        form_name = self.form._name()
        return '<%s for form %s>' % (
            self.__class__.__name__,
            form_name)

    ## FIXME: there should be more methods, and it's unclear if this is
    ## a dictionary-like object or list-like object

    def __getitem__(self, name):
        results = self._name_xpath(self.form, name=name)
        if not results:
            raise KeyError(
                "No input element with the name %r" % name)
        # The type of the first match decides how the matches are grouped.
        input_type = results[0].get('type')
        several = len(results) > 1
        if input_type == 'radio' and several:
            group = RadioGroup(results)
        elif input_type == 'checkbox' and several:
            group = CheckboxGroup(results)
        else:
            # I don't like throwing away elements like this
            return results[0]
        group.name = name
        return group

    def __contains__(self, name):
        matches = self._name_xpath(self.form, name=name)
        return bool(matches)

    def keys(self):
        # Distinct names of all inputs; unnamed inputs (None) are left out.
        names = set()
        for el in self:
            names.add(el.name)
        names.discard(None)
        return list(names)

    def __iter__(self):
        ## FIXME: kind of dumb to turn a list into an iterator, only
        ## to have it likely turned back into a list again :(
        every_input = self._all_xpath(self.form)
        return iter(every_input)
964
class InputMixin(object):

    """
    Mix-in for all input elements (input, select, and textarea)
    """


    def _name__get(self):
        """
        Get/set the name of the element
        """
        return self.get('name')
    def _name__set(self, value):
        self.set('name', value)
    def _name__del(self):
        # Deleting a name that is not set is silently accepted.
        attributes = self.attrib
        if 'name' in attributes:
            del attributes['name']
    name = property(_name__get, _name__set, _name__del, doc=_name__get.__doc__)

    def __repr__(self):
        # The type is shown only when the element has a non-empty ``type``.
        type_part = ''
        input_type = getattr(self, 'type', None)
        if input_type:
            type_part = ' type=%r' % input_type
        return '<%s %x name=%r%s>' % (
            self.__class__.__name__, id(self), self.name, type_part)
992
class TextareaElement(InputMixin, HtmlElement):
    """
    ``<textarea>`` element.  You can get the name with ``.name`` and
    get/set the value with ``.value``
    """

    def _value__get(self):
        """
        Get/set the value (which is the contents of this element)
        """
        # Child elements are serialized as XML for XHTML-namespaced
        # textareas and as HTML otherwise.
        xhtml_prefix = "{%s}" % XHTML_NAMESPACE
        if self.tag.startswith(xhtml_prefix):
            method = 'xml'
        else:
            method = 'html'
        content = self.text or ''
        for child in self:
            # it's rare that we actually get here, so let's not use ''.join()
            content += etree.tostring(child, method=method, encoding=unicode)
        return content
    def _value__set(self, value):
        del self[:]
        self.text = value
    def _value__del(self):
        self.text = ''
        del self[:]
    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

HtmlElementClassLookup._default_element_classes['textarea'] = TextareaElement
class SelectElement(InputMixin, HtmlElement):
    """
    ``<select>`` element.  You can get the name with ``.name``.

    ``.value`` will be the value of the selected option, unless this
    is a multi-select element (``<select multiple>``), in which case
    it will be a set-like object.  In either case ``.value_options``
    gives the possible values.

    The boolean attribute ``.multiple`` shows if this is a
    multi-select.
    """

    def _value__get(self):
        """
        Get/set the value of this select (the selected option).

        If this is a multi-select, this is a set-like object that
        represents all the selected options.
        """
        if self.multiple:
            return MultipleSelectOptions(self)
        for option in _options_xpath(self):
            if option.get('selected') is None:
                continue
            # First selected option wins; an option without a value
            # attribute contributes its text instead.
            selected = option.get('value')
            if selected is None:
                selected = option.text or ''
            if selected:
                selected = selected.strip()
            return selected
        return None

    def _value__set(self, value):
        if self.multiple:
            if isinstance(value, basestring):
                raise TypeError(
                    "You must pass in a sequence")
            selection = self.value
            selection.clear()
            selection.update(value)
            return
        checked_option = None
        if value is not None:
            value = value.strip()
            for option in _options_xpath(self):
                candidate = option.get('value')
                if candidate is None:
                    candidate = option.text or ''
                if candidate:
                    candidate = candidate.strip()
                if candidate == value:
                    checked_option = option
                    break
            if checked_option is None:
                raise ValueError(
                    "There is no option with the value of %r" % value)
        # Only touch the tree once the requested value is known to exist.
        for option in _options_xpath(self):
            if 'selected' in option.attrib:
                del option.attrib['selected']
        if checked_option is not None:
            checked_option.set('selected', '')

    def _value__del(self):
        # FIXME: should del be allowed at all?
        if not self.multiple:
            self.value = None
        else:
            self.value.clear()

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        All the possible values this select can have (the ``value``
        attribute of all the ``<option>`` elements, or the option's
        text when it has no ``value`` attribute).
        """
        collected = []
        for option in _options_xpath(self):
            candidate = option.get('value')
            if candidate is None:
                candidate = option.text or ''
            if candidate:
                candidate = candidate.strip()
            collected.append(candidate)
        return collected
    value_options = property(value_options, doc=value_options.__doc__)

    def _multiple__get(self):
        """
        Boolean attribute: is there a ``multiple`` attribute on this element.
        """
        return 'multiple' in self.attrib
    def _multiple__set(self, value):
        if value:
            self.set('multiple', '')
        elif 'multiple' in self.attrib:
            del self.attrib['multiple']
    multiple = property(_multiple__get, _multiple__set, doc=_multiple__get.__doc__)

HtmlElementClassLookup._default_element_classes['select'] = SelectElement
class MultipleSelectOptions(SetMixin):
    """
    Represents all the selected options in a ``<select multiple>`` element.

    You can add to this set-like option to select an option, or remove
    to unselect the option.

    An option's value is its ``value`` attribute or, when that is
    missing, its stripped text.
    """

    def __init__(self, select):
        self.select = select

    def options(self):
        """
        Iterator of all the ``<option>`` elements.
        """
        return iter(_options_xpath(self.select))
    options = property(options)

    def _option_value(self, option):
        # Value of one option: the attribute if present, else its text.
        opt_value = option.get('value')
        if opt_value is None:
            opt_value = (option.text or '').strip()
        return opt_value

    def __iter__(self):
        # Only selected options belong to the set; unselected ones must
        # not be yielded (they would otherwise end up in form_values()).
        for option in self.options:
            if 'selected' in option.attrib:
                yield self._option_value(option)

    def add(self, item):
        """Select the first option whose value is ``item`` (ValueError if none)."""
        for option in self.options:
            if self._option_value(option) == item:
                option.set('selected', '')
                break
        else:
            raise ValueError(
                "There is no option with the value %r" % item)

    def remove(self, item):
        """
        Unselect the first option whose value is ``item``.

        Raises ValueError when that option is not selected or when no
        option has that value.
        """
        for option in self.options:
            if self._option_value(option) == item:
                if 'selected' in option.attrib:
                    del option.attrib['selected']
                else:
                    raise ValueError(
                        "The option %r is not currently selected" % item)
                break
        else:
            raise ValueError(
                "There is not option with the value %r" % item)

    def __repr__(self):
        return '<%s {%s} for select name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self.select.name)
1170
class RadioGroup(list):
    """
    A list of ``<input type=radio>`` elements that share one name.

    Besides the normal list behaviour, the ``.value`` property
    checks/unchecks the inputs and ``.value_options`` lists the
    possible values.
    """

    def _value__get(self):
        """
        Get/set the value, which checks the radio with that value (and
        unchecks any other value).
        """
        for radio in self:
            if 'checked' not in radio.attrib:
                continue
            return radio.get('value')
        return None

    def _value__set(self, value):
        target = None
        if value is not None:
            # Find the radio to check before touching any state, so an
            # unknown value leaves the group unchanged.
            matches = [radio for radio in self
                       if radio.get('value') == value]
            if not matches:
                raise ValueError(
                    "There is no radio input with the value %r" % value)
            target = matches[0]
        for radio in self:
            if 'checked' in radio.attrib:
                del radio.attrib['checked']
        if target is not None:
            target.set('checked', '')

    def _value__del(self):
        # Deleting the value unchecks every radio in the group.
        self.value = None

    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        values = []
        for radio in self:
            values.append(radio.get('value'))
        return values
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        members = list.__repr__(self)
        return '%s(%s)' % (self.__class__.__name__, members)
1222
class CheckboxGroup(list):
    """
    Represents a group of checkboxes (``<input type=checkbox>``) that
    have the same name.

    In addition to using this like a list, the ``.value`` attribute
    returns a set-like object that you can add to or remove from to
    check and uncheck checkboxes.  You can also use ``.value_options``
    to get the possible values.
    """

    def _value__get(self):
        """
        Return a set-like object that can be modified to check or
        uncheck individual checkboxes according to their value.
        """
        return CheckboxValues(self)

    def _value__set(self, value):
        # Validate before clearing, so a rejected value does not wipe
        # the checkboxes that are currently checked.
        if not hasattr(value, '__iter__'):
            # An empty group has no first element to take a name from.
            name = None
            if self:
                name = self[0].name
            raise ValueError(
                "A CheckboxGroup (name=%r) must be set to a sequence (not %r)"
                % (name, value))
        values = self.value
        values.clear()
        values.update(value)

    def _value__del(self):
        self.value.clear()
    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def value_options(self):
        """
        Returns a list of all the possible values.
        """
        return [el.get('value') for el in self]
    value_options = property(value_options, doc=value_options.__doc__)

    def __repr__(self):
        return '%s(%s)' % (
            self.__class__.__name__, list.__repr__(self))
1254
class CheckboxValues(SetMixin):

    """
    Represents the values of the checked checkboxes in a group of
    checkboxes with the same name.
    """

    def __init__(self, group):
        # The group (a sequence of checkbox elements) this set reflects.
        self.group = group

    def __iter__(self):
        return iter([
            el.get('value')
            for el in self.group
            if 'checked' in el.attrib])

    def add(self, value):
        """
        Check the first checkbox whose ``value`` attribute equals ``value``.

        Raises KeyError if the group has no such checkbox.
        """
        for el in self.group:
            if el.get('value') == value:
                el.set('checked', '')
                break
        else:
            raise KeyError("No checkbox with value %r" % value)

    def remove(self, value):
        """
        Uncheck the first checkbox whose ``value`` attribute equals ``value``.

        Raises KeyError if that checkbox is already unchecked, or if the
        group has no such checkbox.
        """
        for el in self.group:
            if el.get('value') == value:
                if 'checked' in el.attrib:
                    del el.attrib['checked']
                else:
                    raise KeyError(
                        "The checkbox with value %r was already unchecked" % value)
                break
        else:
            raise KeyError(
                "No checkbox with value %r" % value)

    def _group_name(self):
        """
        Return the name shared by the checkboxes, or None if unknown.

        CheckboxGroup is a plain list subclass with no ``name``
        attribute of its own, so fall back to the name of its first
        checkbox instead of failing with AttributeError.
        """
        name = getattr(self.group, 'name', None)
        if name is None:
            for el in self.group:
                name = getattr(el, 'name', None)
                break
        return name

    def __repr__(self):
        return '<%s {%s} for checkboxes name=%r>' % (
            self.__class__.__name__,
            ', '.join([repr(v) for v in self]),
            self._group_name())
1297
class InputElement(InputMixin, HtmlElement):
    """
    Represents an ``<input>`` element.

    ``.type`` gives the input type, lower-cased and defaulting to
    ``'text'``.

    ``.value`` gets and sets the value.

    Checkboxes and radios have ``input.checkable == True`` (it is false
    for every other type) and a boolean ``.checked`` attribute.

    """

    ## FIXME: I'm a little uncomfortable with the use of .checked
    def _value__get(self):
        """
        Get/set the value of this element, using the ``value`` attribute.

        Also, if this is a checkbox and it has no value, this defaults
        to ``'on'``.  If it is a checkbox or radio that is not
        checked, this returns None.
        """
        if not self.checkable:
            return self.get('value')
        if not self.checked:
            return None
        return self.get('value') or 'on'

    def _value__set(self, value):
        if not self.checkable:
            self.set('value', value)
            return
        if not value:
            self.checked = False
            return
        self.checked = True
        # Only a string replaces the ``value`` attribute; any other true
        # value just checks the input.
        if isinstance(value, basestring):
            self.set('value', value)

    def _value__del(self):
        if self.checkable:
            self.checked = False
        elif 'value' in self.attrib:
            del self.attrib['value']
    value = property(_value__get, _value__set, _value__del, doc=_value__get.__doc__)

    def _type__get(self):
        """
        Return the type of this element (using the type attribute).
        """
        declared = self.get('type', 'text')
        return declared.lower()

    def _type__set(self, value):
        self.set('type', value)
    type = property(_type__get, _type__set, doc=_type__get.__doc__)

    def checkable(self):
        """
        Boolean: can this element be checked?
        """
        kind = self.type
        return kind == 'checkbox' or kind == 'radio'
    checkable = property(checkable, doc=checkable.__doc__)

    def _checked__get(self):
        """
        Boolean attribute to get/set the presence of the ``checked``
        attribute.

        You can only use this on checkable input types.
        """
        if self.checkable:
            return 'checked' in self.attrib
        raise AttributeError('Not a checkable input type')

    def _checked__set(self, value):
        if not self.checkable:
            raise AttributeError('Not a checkable input type')
        if value:
            self.set('checked', '')
            return
        if 'checked' in self.attrib:
            del self.attrib['checked']
    checked = property(_checked__get, _checked__set, doc=_checked__get.__doc__)

# Map the 'input' tag name to InputElement in the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['input'] = InputElement

class LabelElement(HtmlElement):
    """
    Represents a ``<label>`` element.

    Label elements are linked to other elements with their ``for``
    attribute.  You can access this element with ``label.for_element``.
    """

    def _for_element__get(self):
        """
        Get/set the element this label points to.  Return None if it
        can't be found.
        """
        target_id = self.get('for')
        if not target_id:
            return None
        body = self.body
        try:
            return body.get_element_by_id(target_id)
        except KeyError:
            # NOTE(review): get_element_by_id is defined outside this
            # block; it presumably raises KeyError for an unknown id --
            # confirm.  Returning None honours the documented contract.
            return None

    def _for_element__set(self, other):
        """
        Point this label at ``other`` by copying its ``id`` into the
        ``for`` attribute.  Raises TypeError if ``other`` has no id.
        """
        target_id = other.get('id')
        if not target_id:
            raise TypeError(
                "Element %r has no id attribute" % other)
        self.set('for', target_id)

    def _for_element__del(self):
        # The link lives in the ``for`` attribute; the label's own
        # ``id`` must be left alone.
        if 'for' in self.attrib:
            del self.attrib['for']
    for_element = property(_for_element__get, _for_element__set, _for_element__del,
                           doc=_for_element__get.__doc__)

# Map the 'label' tag name to LabelElement in the lookup's table of
# default element classes.
HtmlElementClassLookup._default_element_classes['label'] = LabelElement

############################################################
## Serialization
############################################################

def html_to_xhtml(html):
    """Move every un-namespaced tag of an HTML tree into the XHTML
    namespace, in place.

    ``html`` may be an element or an object with a ``getroot()`` method.
    """
    try:
        root = html.getroot()
    except AttributeError:
        # No getroot(): treat the argument itself as the root element.
        root = html
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    for element in root.iter():
        name = element.tag
        # Skip nodes whose tag is not a string.
        if not isinstance(name, basestring):
            continue
        # Tags starting with '{' already carry a namespace.
        if name[0] == '{':
            continue
        element.tag = namespace_prefix + name
1433
def xhtml_to_html(xhtml):
    """Strip the XHTML namespace from every tag of an XHTML tree, in
    place, turning them into plain HTML tags.

    ``xhtml`` may be an element or an object with a ``getroot()`` method.
    """
    try:
        root = xhtml.getroot()
    except AttributeError:
        # No getroot(): treat the argument itself as the root element.
        root = xhtml
    namespace_prefix = "{%s}" % XHTML_NAMESPACE
    skip = len(namespace_prefix)
    # Only elements inside the XHTML namespace are visited.
    for element in root.iter(namespace_prefix + "*"):
        element.tag = element.tag[skip:]

# This isn't a general match, but it's a match for what libxml2
# specifically serialises:
# Bound ``sub`` of a text pattern matching a
# ``<meta http-equiv="Content-Type" ...>`` tag.
__str_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>').sub
# The same pattern, ASCII-encoded so it can be applied to byte strings.
__bytes_replace_meta_content_type = re.compile(
    r'<meta http-equiv="Content-Type"[^>]*>'.encode('ASCII')).sub

def tostring(doc, pretty_print=False, include_meta_content_type=False,
             encoding=None, method="html"):
    """Return an HTML string representation of the document.

    Note: if include_meta_content_type is true this will create a
    ``<meta http-equiv="Content-Type" ...>`` tag in the head;
    regardless of the value of include_meta_content_type any existing
    ``<meta http-equiv="Content-Type" ...>`` tag will be removed

    The ``encoding`` argument controls the output encoding (defaults to
    ASCII, with &#...; character references for any characters outside
    of ASCII).

    The ``method`` argument defines the output method.  It defaults to
    'html', but can also be 'xml' for xhtml output, or 'text' to
    serialise to plain text without markup.  Note that you can pass
    the builtin ``unicode`` type as ``encoding`` argument to serialise
    to a unicode string.

    Example::

        >>> from lxml import html
        >>> root = html.fragment_fromstring('<p>Hello<br>world!</p>')

        >>> html.tostring(root)
        b'<p>Hello<br>world!</p>'
        >>> html.tostring(root, method='html')
        b'<p>Hello<br>world!</p>'

        >>> html.tostring(root, method='xml')
        b'<p>Hello<br/>world!</p>'

        >>> html.tostring(root, method='text')
        b'Helloworld!'

        >>> html.tostring(root, method='text', encoding=unicode)
        u'Helloworld!'
    """
    serialised = etree.tostring(doc, method=method, pretty_print=pretty_print,
                                encoding=encoding)
    # The meta tag is only stripped from HTML output, and only when the
    # caller did not ask to keep it.
    if include_meta_content_type or method != 'html':
        return serialised
    # Pick the substitution matching the type of the serialised result.
    if isinstance(serialised, str):
        return __str_replace_meta_content_type('', serialised)
    return __bytes_replace_meta_content_type(bytes(), serialised)

# Rewrite the doctest output lines in the docstring: __fix_docstring drops
# the leading u'' prefix on Python 3 and the leading b'' prefix otherwise.
tostring.__doc__ = __fix_docstring(tostring.__doc__)

def open_in_browser(doc, encoding=None):
    """
    Write the HTML document to a temporary ``.html`` file and open that
    file in a web browser.  The file is not deleted afterwards.  This is
    mainly meant for debugging.
    """
    import os
    import tempfile
    import webbrowser
    tree = doc
    if not isinstance(tree, etree._ElementTree):
        tree = etree.ElementTree(tree)
    descriptor, path = tempfile.mkstemp(suffix='.html')
    out = os.fdopen(descriptor, 'wb')
    try:
        # Explicit encoding first, then the document's own, then UTF-8.
        out_encoding = encoding or tree.docinfo.encoding or "UTF-8"
        tree.write(out, method="html", encoding=out_encoding)
    finally:
        # we leak the file itself here, but we should at least close it
        out.close()
    url = 'file://' + path.replace(os.path.sep, '/')
    print(url)
    webbrowser.open(url)
1524 1525 ################################################################################ 1526 # configure Element class lookup 1527 ################################################################################ 1528
class HTMLParser(etree.HTMLParser):
    """An HTML parser set up with an ``HtmlElementClassLookup`` so that
    it returns lxml.html Element objects.
    """

    def __init__(self, **kwargs):
        super(HTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1536
class XHTMLParser(etree.XMLParser):
    """An XML parser set up with an ``HtmlElementClassLookup`` so that
    it returns lxml.html Element objects.

    Note that this parser is not really XHTML aware unless you let it
    load a DTD that declares the HTML entities.  To do this, make sure
    you have the XHTML DTDs installed in your catalogs, and create the
    parser like this::

        >>> parser = XHTMLParser(load_dtd=True)

    If you additionally want to validate the document, use this::

        >>> parser = XHTMLParser(dtd_validation=True)

    For catalog support, see http://www.xmlsoft.org/catalog.html.
    """

    def __init__(self, **kwargs):
        super(XHTMLParser, self).__init__(**kwargs)
        lookup = HtmlElementClassLookup()
        self.set_element_class_lookup(lookup)
1557
def Element(*args, **kw):
    """Create a new HTML Element.

    All arguments are passed on to ``html_parser.makeelement()``.
    This can also be used for XHTML documents.
    """
    return html_parser.makeelement(*args, **kw)

# Module-level parser instances; Element() creates its elements through
# html_parser.
html_parser = HTMLParser()
xhtml_parser = XHTMLParser()
