Source code for openwpm.commands.utils.XPathUtil
# XPathUtil.py
# A collection of utilities to extract and parse
# XPaths encountered while scraping.
#
# Steven Englehardt (github.com/englehardt)
import re
import bs4
from bs4 import BeautifulSoup as bs
[docs]
def is_clickable(xpath):
    """Return True if any step of *xpath* is an 'a', 'button' or 'input' tag.

    Such elements most likely hold a link or a control. This is only a
    heuristic: the type attribute of an input element is not inspected,
    and no other tag is ever considered clickable.

    Predicates (index and attribute brackets such as ``[1]`` or
    ``[@id='x']``) are removed before the path is split on ``/``, so tag
    names and slashes that appear *inside* a predicate do not count as
    steps of the path. Nested predicates are handled as well.

    :param xpath: XPath expression as a string.
    :return: True when a clickable tag is one of the path steps.
    """
    # Matches a bracket pair that contains no further brackets, i.e. the
    # innermost predicate. Removing these repeatedly peels nested
    # predicates from the inside out; a single greedy-to-first-"]" pass
    # would leave fragments like "a]" behind.
    innermost_predicate = re.compile(r"\[[^\[\]]*\]")

    stripped = xpath
    while True:
        stripped, removed = innermost_predicate.subn("", stripped)
        if not removed:
            break

    steps = stripped.split("/")
    return not {"a", "button", "input"}.isdisjoint(steps)
# ExtractXPath(element, use_id)
# - element: a bs4 tag node
# - use_id: defaults True
#
# Traverses up the tag tree of a Beautiful Soup node
# to return the XPath of that node.
#
# Use of ids is preferred when the xpath will be used
# outside of BeautifulSoup. Since an id is unique to
# all elements of the tree, it allows the use of a
# wildcard for all parent nodes. This minimizes the
# chances of incorrect indexing (which can occur if
# javascript changes a page during processing).
# xp1_wildcard adds wildcard functionality to XPath 1.0
# strings using the limited function set supported by the 1.0
# implementation.
#
# xp1_lowercase likewise adds lowercase functionality
#
# Hopefully you never need these...
[docs]
def xp1_lowercase(string):
    """Wrap an XPath expression so that it evaluates to lowercase.

    XPath 1.0 has no lower-case() function, so the expression is passed
    through translate(), mapping each ASCII capital to its lowercase
    counterpart.

    :param string: XPath expression to wrap (e.g. ``text()`` or ``@id``).
    :return: The ``translate(...)`` expression as a string.
    """
    upper = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    lower = "abcdefghijklmnopqrstuvwxyz"
    pieces = ["translate(", string, ", '", upper, "', '", lower, "')"]
    return "".join(pieces)
# Converts a string with a wildcard in it to an XPath 1.0
# compatible string *** ONLY SUPPORTS 1 WILDCARD ***
# string: string w/ wildcard that you are searching for
# attr: tag attribute you are searching for (e.g. 'text()' or '@id' or ...)
[docs]
def _xp1_literal(value):
    """Return *value* written as an XPath 1.0 string literal.

    XPath 1.0 has no escape character inside string literals, so the
    quote style is chosen to avoid the quotes present in *value*; when
    both kinds occur the literal is assembled with concat().
    """
    if "'" not in value:
        return "'" + value + "'"
    if '"' not in value:
        return '"' + value + '"'
    # Both quote kinds present: single-quote every apostrophe-free piece
    # and glue the pieces back together with a double-quoted apostrophe.
    pieces = ["'" + piece + "'" for piece in value.split("'")]
    return "concat(" + ", \"'\", ".join(pieces) + ")"


def xp1_wildcard(attr, string, normalize=True):
    """Build an XPath 1.0 predicate matching *attr* against a wildcard string.

    Only ONE ``*`` wildcard is supported. The text before the wildcard is
    matched with starts-with(); the text after it is matched against the
    tail of the attribute value (XPath 1.0 has no ends-with()).

    If *string* does not contain exactly one wildcard, or consists of the
    wildcard alone, an error message is printed and an exact-match
    predicate against *string* (as a quoted literal) is returned.

    :param attr: attribute expression to test, e.g. ``text()`` or ``@id``.
    :param string: search string containing the wildcard.
    :param normalize: when true, wrap *attr* in normalize-space().
    :return: the predicate, including the surrounding square brackets.
    """
    if normalize:
        attr = "normalize-space(" + attr + ")"

    parts = string.split("*")
    if len(parts) != 2:
        print("ERROR: This function is meant to support 1 wildcard")
        # The literal must be quoted: a bare word in XPath is a
        # child-element name test, not a string.
        return "[" + attr + "=" + _xp1_literal(string) + "]"

    prefix, suffix = parts
    conditions = []
    if prefix:
        conditions.append(
            "starts-with(" + attr + ", " + _xp1_literal(prefix) + ")"
        )
    if suffix:
        # Emulate ends-with(): substring() from position
        # string-length - (len(suffix) - 1) yields the last len(suffix)
        # characters of the attribute value.
        conditions.append(
            "contains(substring("
            + attr
            + ", string-length("
            + attr
            + ")-"
            + str(len(suffix) - 1)
            + "), "
            + _xp1_literal(suffix)
            + ")"
        )

    if not conditions:
        print("ERROR: The string is empty")
        return "[" + attr + "=" + _xp1_literal(string) + "]"
    return "[" + " and ".join(conditions) + "]"
[docs]
def main():
    """Print a few sample XPaths for random text-bearing elements of a page.

    Fetches the reddit front page, picks up to five random elements whose
    text contains a run of at least ten alphanumeric characters, and
    prints each element followed by the XPath computed for it.

    Nothing beyond the banner is printed when the response status is not
    200; a short notice is printed when no matching text is found.
    """
    # Output some sample XPaths
    print("--- Sample XPaths ---")
    from random import choice
    from urllib.request import urlopen

    # The context manager closes the HTTP response even if reading or
    # parsing fails.
    with urlopen("http://www.reddit.com/") as rsp:
        if rsp.getcode() != 200:
            return
        soup = bs(rsp.read(), "lxml")

    elements = soup.findAll(text=re.compile("[A-Za-z0-9]{10,}"))
    if not elements:
        # choice() raises IndexError on an empty sequence.
        print("No matching text nodes found")
        return

    for _ in range(5):
        element = choice(elements).parent
        print("HTML")
        print(element)
        print("XPath")
        # NOTE(review): ExtractXPath is not defined in the visible part of
        # this module; it is expected to be provided elsewhere in the file.
        print(ExtractXPath(element))
        print("**************")
# Run the sample-XPath demo only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()