Source code for openwpm.commands.utils.XPathUtil

# XPathUtil.py
# A collection of utilities to extract and parse
# XPaths encountered while scraping.
#
# Steven Englehardt (github.com/englehardt)


import re

import bs4
from bs4 import BeautifulSoup as bs



[docs]
def is_clickable(xpath):
    """Return True if the XPath contains an ``a``, ``button`` or ``input`` step.

    Any XPath passing through one of these tags is treated as clickable,
    since it most likely contains a link. It may make sense to also check
    for ``<input type="button">`` or other tags.

    Bracketed predicates (positional indices such as ``[2]`` and id tests
    such as ``[@id="x"]``) are stripped first, so only the bare tag names
    of the path are compared.
    """
    # Drop every "[...]" group so each step is reduced to its tag name.
    bare_path = re.sub(r"\[[^\]]*\]", "", xpath)
    steps = bare_path.split("/")
    return any(tag in steps for tag in ("a", "button", "input"))



# ExtractXPath(element, use_id)
# - element: a bs4 tag node
# - use_id: defaults True
#
# Traverses up the tag tree of a Beautiful Soup node
# to return the XPath of that node.
#
# Use of ids is preferred when the xpath will be used
# outside of BeautifulSoup. Since an id is unique to
# all elements of the tree, it allows the use of a
# wildcard for all parent nodes. This minimizes the
# chances of incorrect indexing (which can occur if
# javascript changes a page during processing).



[docs]
class ExtractXPathError(Exception):
    """Raised by ExtractXPath when it is given something other than a tag node.

    The offending value (or message) is kept on ``value``; ``str()`` of the
    exception is the ``repr`` of that value.
    """

    def __init__(self, value):
        super().__init__(value)
        self.value = value

    def __str__(self):
        return "{!r}".format(self.value)




[docs]
def check_previous_tags(node, use_id=True):
    """Return the XPath step for ``node`` relative to its parent.

    The step is the node's tag name, followed by a 1-based positional
    index (``name[n]``) when at least one earlier sibling tag has the same
    name. A node with no such earlier sibling is returned as the bare name.

    ``use_id`` is accepted but not used by this function.
    """
    # Only real Tag siblings count; text nodes, comments etc. are skipped.
    earlier_same_name = sum(
        1
        for sibling in node.previous_siblings
        if type(sibling) == bs4.element.Tag and sibling.name == node.name
    )
    position = earlier_same_name + 1

    if position > 1:
        return node.name + "[%d]" % position
    return node.name




[docs]
def ExtractXPath(element, use_id=True):
    """Build an XPath for a Beautiful Soup tag by walking up its parents.

    element: a ``bs4.element.Tag`` node.
    use_id:  when true (the default), the walk stops at the first node
             (the element itself or an ancestor) that has an ``id``
             attribute, and the path is anchored there as
             ``//*/tag[@id="..."]/...`` instead of continuing to the root.

    Returns the XPath string. Raises ExtractXPathError if ``element`` is
    not exactly a ``bs4.element.Tag``.
    """
    if type(element) != bs4.element.Tag:
        raise ExtractXPathError(
            "%s is not a supported data type. "
            "Only tag nodes from the tag tree are accepted." % type(element)
        )

    # The element's own id, when allowed and present, is a complete path.
    if use_id:
        own_id = element.get("id")
        if own_id is not None:
            return "//*/" + element.name + '[@id="' + own_id + '"]'

    # Steps are collected leaf-first and reversed when the path is joined.
    steps = [check_previous_tags(element)]

    for ancestor in element.parents:
        # The document object marks the top of the tree; it is not a step.
        if ancestor.name == "[document]":
            break

        ancestor_id = ancestor.get("id") if use_id else None
        if ancestor_id is not None:
            anchor = "//*/" + ancestor.name + '[@id="' + ancestor_id + '"]/'
            return anchor + "/".join(reversed(steps))

        steps.append(check_previous_tags(ancestor))

    return "/" + "/".join(reversed(steps))



# xp1_wildcard adds wildcard functionality to XPath 1.0
# strings using the limited function set supported by the 1.0
# implementation.
#
# xp1_lowercase likewise adds lowercase functionality
#
# Hopefully you never need these...



[docs]
def xp1_lowercase(string):
    """Wrap an XPath expression in a ``translate()`` call that lowercases A-Z.

    ``string`` is the XPath expression text (e.g. ``text()`` or ``@id``);
    the returned text is an XPath 1.0 expression mapping each uppercase
    ASCII letter to its lowercase counterpart.
    """
    pieces = (
        "translate(",
        string,
        ", 'ABCDEFGHIJKLMNOPQRSTUVWXYZ', 'abcdefghijklmnopqrstuvwxyz')",
    )
    return "".join(pieces)



# Converts a string with a wildcard in it to an XPath 1.0
# compatible string *** ONLY SUPPORTS 1 WILDCARD ***
# string: string w/ wildcard that you are searching for
# attr: tag attribute you are searching for (e.g. 'text()' or '@id' or ...)



[docs]
def xp1_wildcard(attr, string, normalize=True):
    """Build an XPath 1.0 predicate matching ``attr`` against a wildcard string.

    attr:      the expression to test, e.g. ``text()`` or ``@id``.
    string:    the text to search for, containing exactly one ``*`` wildcard.
    normalize: when true (the default), ``attr`` is wrapped in
               ``normalize-space()`` before being compared.

    Returns a bracketed predicate string:
      * ``prefix*``        -> ``[starts-with(attr, 'prefix')]``
      * ``*suffix``        -> an ends-with test built from ``substring`` and
                              ``string-length``
      * ``prefix*suffix``  -> both tests joined with ``and``

    If ``string`` does not contain exactly one wildcard, or is just ``*``,
    an error line is printed and an exact-match predicate
    ``[attr='string']`` is returned. The literal is quoted in that fallback
    so it is compared as a string, consistent with the wildcard branches.

    Single quotes inside ``string`` are not escaped.
    """
    parts = string.split("*")

    if normalize:
        attr = "normalize-space(" + attr + ")"

    # Fallback used for every unsupported input: compare against the quoted
    # literal rather than emitting the raw text as an XPath expression.
    exact_match = "[" + attr + "='" + string + "']"

    if len(parts) != 2:
        print("ERROR: This function is meant to support 1 wildcard")
        return exact_match

    prefix, suffix = parts
    conditions = []

    if prefix != "":
        conditions.append("starts-with(" + attr + ", '" + prefix + "')")
    if suffix != "":
        # XPath 1.0 has no ends-with(): take the last len(suffix) characters
        # (substring() is 1-indexed, hence the "- 1") and test them.
        conditions.append(
            "contains(substring("
            + attr
            + ", string-length("
            + attr
            + ")-"
            + str(len(suffix) - 1)
            + "), '"
            + suffix
            + "')"
        )

    if not conditions:
        print("ERROR: The string is empty")
        return exact_match

    return "[" + " and ".join(conditions) + "]"




[docs]
def main():
    """Print XPaths for a few randomly chosen elements of a live web page.

    Fetches http://www.reddit.com/, parses it with the lxml parser, picks
    up to five random text nodes containing a run of at least ten
    alphanumeric characters, and prints each node's parent element along
    with the XPath that ExtractXPath produces for it.

    Nothing further is printed if the response status is not 200 or if no
    matching text is found.
    """
    from random import choice
    from urllib.request import urlopen

    # Output some sample XPaths
    print("--- Sample XPaths ---")

    # The context manager closes the HTTP response once the body is read.
    with urlopen("http://www.reddit.com/") as rsp:
        if rsp.getcode() != 200:
            return
        html = rsp.read()

    soup = bs(html, "lxml")
    elements = soup.findAll(text=re.compile("[A-Za-z0-9]{10,}"))
    if not elements:
        # choice() raises IndexError on an empty sequence.
        return

    for _ in range(5):
        element = choice(elements).parent
        print("HTML")
        print(element)
        print("XPath")
        print(ExtractXPath(element))
        print("**************")


# Run the sample-XPath demo only when this file is executed as a script.
if __name__ == "__main__":
    main()
Source code for openwpm.commands.utils.XPathUtil

OpenWPM

Navigation

Related Topics