from lxml import etree

content = '... some html ...'

# Use the HTML parser explicitly so the input encoding can be specified.
parser = etree.HTMLParser(encoding='utf-8')

# Load the content using the parser; the result is the root element of an
# XML-style element tree built from the HTML.
tree = etree.fromstring(content, parser)

# Now collect the links (<a> elements) in the document.
links = tree.xpath(".//*/a")

for link in links:
    # Value of the tag's attribute; None when the attribute is missing.
    href = link.get('href')
    # Text between the open and close tags. `text` is an attribute, not a
    # method, so it must not be called; it is None when the element has no
    # text before its first child.
    name = link.text
Some links:
API reference
Usage tutorial