Source code for openwpm.commands.utils.XPathUtil

# XPathUtil.py
# A collecton of utilities to extract and parse
# XPaths encountered while scraping.
#
# Steven Englehardt (github.com/englehardt)


import re

import bs4
from bs4 import BeautifulSoup as bs


[docs] def is_clickable(xpath): # We consider any xpath that has an 'a', 'button', # or 'input' tag to be clickable as it most likely # contains a link. It may make sense to see check # <input type="button"> or other tags... index_regex = re.compile(r"\[[^\]]*\]") # match index and id brackets # check xpath for necessary tags temp = re.sub(index_regex, "", xpath) temp = temp.split("/") if "a" in temp or "button" in temp or "input" in temp: return True return False
# ExtractXPath(element, use_id) # - element: a bs4 tag node # - use_id: defaults True # # Traverses up the tag tree of a Beautiful Soup node # to return the XPath of that node. # # Use of ids is preferred when the xpath will be used # outside of BeautifulSoup. Since an id is unique to # all elements of the tree, it allows the use of a # wildcard for all parent nodes. This minimizes the # chances of incorrect indexing (which can occur if # javascript changes a page during processing).
[docs] class ExtractXPathError(Exception): def __init__(self, value): self.value = value def __str__(self): return repr(self.value)
[docs] def check_previous_tags(node, use_id=True): # index of node counter = 1 for tag in node.previous_siblings: if type(tag) != bs4.element.Tag: continue elif tag.name == node.name: counter += 1 # XPath name if counter > 1: xpath = node.name + "[%d]" % counter else: xpath = node.name return xpath
[docs] def ExtractXPath(element, use_id=True): # Check that element is a tag node if type(element) != bs4.element.Tag: raise ExtractXPathError( "%s is not a supported data type. " "Only tag nodes from the tag tree are accepted." % type(element) ) # Starting node # Check id first if use_id and element.get("id") is not None: return "//*/" + element.name + '[@id="' + element.get("id") + '"]' xpath = check_previous_tags(element) # Parent Nodes for parent in element.parents: # End of XPath - exclude from string if parent.name == "[document]": break # Check id first if use_id and parent.get("id") is not None: return ( "//*/" + parent.name + '[@id="' + parent.get("id") + '"]/' + xpath ) # noqa xpath = check_previous_tags(parent) + "/" + xpath xpath = "/" + xpath return xpath
# xp1_wildcard adds wildcard functionality to XPath 1.0 # strings using the limited function set supported by the 1.0 # implementation. # # xp1_lowercase likewise adds lowercase functionality # # Hopefully you never need these...
[docs] def xp1_lowercase(string): return ( "translate(" + string + ", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')" ) # noqa
# Converts a string with a wildcard in it to an XPath 1.0 # compatible string *** ONLY SUPPORTS 1 WILDCARD *** # string: string w/ wildcard that you are searching for # attr: tag attribute you are searching for (e.g. 'text()' or '@id' or ...)
[docs] def xp1_wildcard(attr, string, normalize=True): parts = string.split("*") if normalize: attr = "normalize-space(" + attr + ")" if len(parts) != 2: print("ERROR: This function is meant to support 1 wildcard") return "[" + attr + "=" + string + "]" else: pt1 = "" pt2 = "" if parts[0] != "": pt1 = "starts-with(" + attr + ", '" + parts[0] + "')" if parts[1] != "": pt2 = ( "contains(substring(" + attr + ", string-length(" + attr + ")-" + str(len(parts[1]) - 1) + "), '" + parts[1] + "')" ) if pt1 == "" and pt2 != "": return "[" + pt2 + "]" elif pt1 != "" and pt2 == "": return "[" + pt1 + "]" elif pt1 != "" and pt2 != "": return "[" + pt1 + " and " + pt2 + "]" else: print("ERROR: The string is empty") return "[" + attr + "=" + string + "]"
[docs] def main(): # Output some sample XPaths print("--- Sample XPaths ---") import re from random import choice from urllib.request import urlopen rsp = urlopen("http://www.reddit.com/") if rsp.getcode() == 200: soup = bs(rsp.read(), "lxml") elements = soup.findAll(text=re.compile("[A-Za-z0-9]{10,}")) for i in range(0, 5): element = choice(elements).parent print("HTML") print(element) print("XPath") print(ExtractXPath(element)) print("**************")
if __name__ == "__main__": main()