from lxml import etree
content = '... some html ...'
# Use the HTML parser explicitly so that an encoding can be specified.
parser = etree.HTMLParser(encoding='utf-8')
# Parse the content with that parser.
tree = etree.fromstring(content, parser)
# We now have an XML-style element tree built from the HTML;
# query it with XPath to collect the <a> elements in the document.
links = tree.xpath(".//*/a")
for link in links:
    # Value of the tag's href attribute (None when the attribute is absent).
    href = link.get('href')
    # Text between the open and close tags. `text` is an attribute,
    # not a method, so it must not be called.
    name = link.text
Some links:
API reference
Usage tutorial