Source code for openwpm.config

import tempfile
from dataclasses import dataclass, field
from json import JSONEncoder
from pathlib import Path
from typing import Any, Dict, List, Literal, Optional, Tuple, Union

from dataclasses_json import DataClassJsonMixin
from dataclasses_json import config as DCJConfig

from .errors import ConfigError
from .types import BrowserId

# Accepted boolean values. Not referenced by any code visible in this part
# of the module.
BOOL_TYPE_VALIDATION_LIST = [True, False]
# Values accepted (case-insensitively) for BrowserParams.display_mode;
# checked in validate_browser_params.
DISPLAY_MODE_VALIDATION_LIST = ["native", "headless", "xvfb"]
# Values accepted (case-insensitively) for BrowserParams.browser;
# checked in validate_browser_params.
SUPPORTED_BROWSER_LIST = [
    "firefox"
]  # A list rather than a plain str to future-proof the logic, as OpenWPM may add support for more browsers
# Values accepted (case-insensitively) for BrowserParams.tp_cookies;
# checked in validate_browser_params.
TP_COOKIES_OPTIONALS_LIST = ["always", "never", "from_visited"]
# File suffixes accepted (case-insensitively) for ManagerParams.log_path;
# checked in validate_manager_params.
LOG_EXTENSION_TYPE_LIST = [".log"]
# Message template for a BrowserParams value outside its allowed list.
# Placeholders: {value}, {parameter_name}, {value_list}.
CONFIG_ERROR_STRING = (
    "Found {value} as value for {parameter_name} in BrowserParams. "
    "Supported values are {value_list}. Please look at "
    "docs/Configuration.md#browser-configuration-options for more information"
)
# Message template for an unsupported file extension in ManagerParams.
# Placeholders: {extension}, {parameter_name}, {value_list}.
EXTENSION_ERROR_STRING = (
    "Found {extension} extension for {parameter_name} in ManagerParams "
    "supported extensions are {value_list}. Please look at "
    "docs/Configuration.md#platform-configuration-options for more information"
)
# Generic message template for an invalid value of any parameter.
# Placeholders: {value}, {parameter_name}, {params_type}.
GENERAL_ERROR_STRING = (
    "Found invalid value `{value}` for {parameter_name} in {params_type}. "
    "Please look at docs/Configuration.md for more information"
)

# Resource type names that may appear in a comma-separated
# BrowserParams.save_content string; validate_browser_params rejects any
# entry that is not a member of this set.
ALL_RESOURCE_TYPES = {
    "beacon",
    "csp_report",
    "font",
    "image",
    "imageset",
    "main_frame",
    "media",
    "object",
    "object_subrequest",
    "ping",
    "script",
    "stylesheet",
    "sub_frame",
    "web_manifest",
    "websocket",
    "xml_dtd",
    "xmlhttprequest",
    "xslt",
    "other",
}


def str_to_path(string: Optional[str]) -> Optional[Path]:
    """Turn a string into a ``Path``, passing ``None`` through untouched.

    Used as the JSON decoder for the ``Path``-typed dataclass fields in
    this module.
    """
    return None if string is None else Path(string)
def path_to_str(path: Optional[Path]) -> Optional[str]:
    """Render a ``Path`` as its resolved (absolute) string form.

    ``None`` is passed through untouched. Used as the JSON encoder for the
    ``Path``-typed dataclass fields in this module.
    """
    if path is None:
        return None
    resolved = path.resolve()
    return str(resolved)
@dataclass
class BrowserParams(DataClassJsonMixin):
    """
    Configuration that might differ per browser

    OpenWPM allows you to run multiple browsers with different
    configurations in parallel and this class allows you
    to customize behaviour of an individual browser
    """

    cookie_instrument: bool = True
    js_instrument: bool = False
    # Entries are either strings or dicts; the default is the single
    # string "collection_fingerprinting".
    js_instrument_settings: List[Union[str, dict]] = field(
        default_factory=lambda: ["collection_fingerprinting"]
    )
    http_instrument: bool = False
    navigation_instrument: bool = False
    # Either a bool or a comma-separated string of resource types; string
    # entries are checked against ALL_RESOURCE_TYPES in
    # validate_browser_params.
    save_content: Union[bool, str] = False
    # validate_browser_params currently rejects any truthy value here.
    callstack_instrument: bool = False
    dns_instrument: bool = False
    # (De)serialized to/from JSON via path_to_str / str_to_path.
    seed_tar: Optional[Path] = field(
        default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path)
    )
    # Checked case-insensitively against DISPLAY_MODE_VALIDATION_LIST.
    display_mode: Literal["native", "headless", "xvfb"] = "native"
    # Checked case-insensitively against SUPPORTED_BROWSER_LIST.
    browser: str = "firefox"
    prefs: dict = field(default_factory=dict)
    # Checked case-insensitively against TP_COOKIES_OPTIONALS_LIST.
    tp_cookies: str = "always"
    bot_mitigation: bool = False
    # (De)serialized to/from JSON via path_to_str / str_to_path.
    profile_archive_dir: Optional[Path] = field(
        default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path)
    )
    # The default is computed once, when this module is imported.
    tmp_profile_dir: Path = field(
        default=Path(tempfile.gettempdir()),
        metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
    )
    """
    The tmp_profile_dir defaults to the OS's temporary file folder (typically /tmp) and is where the generated
    browser profiles and residual files are stored.
    """
    maximum_profile_size: Optional[int] = None
    """
    The total amount of on disk space the generated
    browser profiles and residual files are allowed to consume in bytes.
    If this option is not set, no checks will be performed

    Rationale
    ---------
    This option can serve as a happy medium between killing a browser after each
    crawl and allowing the application to still perform quickly.

    Used as a way to save space
    in a limited environment with minimal detriment to speed.

    If the maximum_profile_size is exceeded after a CommandSequence
    is completed, the browser will be shut down and a new one will
    be created. **Even with this setting you may temporarily have
    more disk usage than the sum of all maximum_profile_sizes**
    However, this will also ensure that a CommandSequence is
    allowed to complete without undue interruptions.

    Sample values
    -------------
    * 1073741824: 1GB
    * 20971520: 20MB - for testing purposes
    * 52428800: 50MB
    * 73400320: 70MB
    * 104857600: 100MB - IDEAL for 10+ browsers

    """
    # NOTE(review): unlike the other Path-typed fields of this class, no
    # path_to_str / str_to_path metadata is attached here - confirm that
    # this is intended.
    recovery_tar: Optional[Path] = None
    donottrack: bool = False
    tracking_protection: bool = False
    custom_params: Dict[Any, Any] = field(default_factory=lambda: {})
@dataclass
class ManagerParams(DataClassJsonMixin):
    """
    Configuration for the TaskManager

    The configuration will be the same for all browsers running on the same
    TaskManager.
    It can be used to control storage locations or which watchdogs should
    run
    """

    # The default is computed once, when this module is imported.
    data_directory: Path = field(
        default=Path.home() / "openwpm",
        metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
    )
    """The directory into which screenshots and page dumps will be saved"""
    # validate_manager_params requires the suffix of this path to be one of
    # LOG_EXTENSION_TYPE_LIST (compared case-insensitively).
    log_path: Path = field(
        default=Path.home() / "openwpm" / "openwpm.log",
        metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path),
    )
    """The path to the file in which OpenWPM will log. The
    directory given will be created if it does not exist."""
    testing: bool = False
    """A platform wide flag that can be used to only run certain functionality
    while testing. For example, the Javascript instrumentation"""
    memory_watchdog: bool = False
    """A watchdog that tries to ensure that no Firefox instance takes up too much memory.
    It is mostly useful for long running cloud crawls"""
    process_watchdog: bool = False
    """It is used to create another thread that kills off `GeckoDriver` (or `Xvfb`)
    instances that haven't been spawned by OpenWPM. (GeckoDriver is used by
    Selenium to control Firefox and Xvfb a "virtual display" so we simulate having graphics when running on a server).
    """
    # validate_crawl_configs requires this to equal the number of
    # BrowserParams instances passed alongside these ManagerParams.
    num_browsers: int = 1
    # Backing field for the failure_limit property below; None means
    # "derive the limit from num_browsers".
    _failure_limit: Optional[int] = None
    """The number of command failures the platform will tolerate before raising a
    `CommandExecutionError` exception. Otherwise the default is set to 2 x the
    number of browsers plus 10. The failure counter is reset at the end of each
    successfully completed command sequence.
    For non-blocking command sequences that cause the number of failures to
    exceed `failure_limit` the `CommandExecutionError` is raised when attempting to
    execute the next command sequence."""

    @property
    def failure_limit(self) -> int:
        """Return the explicit limit if one was set, else ``2 * num_browsers + 10``."""
        if self._failure_limit is None:
            return 2 * self.num_browsers + 10
        return self._failure_limit

    @failure_limit.setter
    def failure_limit(self, value: int) -> None:
        """Store ``value`` as the explicit failure limit (no validation is done here)."""
        self._failure_limit = value
@dataclass
class BrowserParamsInternal(BrowserParams):
    """``BrowserParams`` extended with three extra optional fields.

    All three default to ``None``. Presumably they are filled in by the
    platform at runtime rather than by users - TODO confirm against the
    code that instantiates this class.
    """

    browser_id: Optional[BrowserId] = None
    # No custom JSON encoder/decoder is attached to this Path field.
    profile_path: Optional[Path] = None
    cleaned_js_instrument_settings: Optional[List[Dict[str, Any]]] = None
@dataclass
class ManagerParamsInternal(ManagerParams):
    """``ManagerParams`` extended with four extra optional fields.

    All four default to ``None``. Presumably they are filled in by the
    platform at runtime rather than by users - TODO confirm against the
    code that instantiates this class.
    """

    # Annotated as a (str, int) pair - presumably host and port; TODO confirm.
    storage_controller_address: Optional[Tuple[str, int]] = None
    # Annotated as a variable-length tuple of strings.
    logger_address: Optional[Tuple[str, ...]] = None
    # Both paths below are (de)serialized via path_to_str / str_to_path.
    screenshot_path: Optional[Path] = field(
        default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path)
    )
    source_dump_path: Optional[Path] = field(
        default=None, metadata=DCJConfig(encoder=path_to_str, decoder=str_to_path)
    )
def validate_browser_params(browser_params: BrowserParams) -> None:
    """Check that the values of a ``BrowserParams`` instance are usable.

    An instance that compares equal to a freshly constructed
    ``BrowserParams()`` is accepted without running any check.

    Args:
        browser_params: The configuration to validate.

    Raises:
        ConfigError: With a parameter-specific message when ``display_mode``,
            ``browser``, ``tp_cookies``, ``callstack_instrument`` or
            ``save_content`` holds an unsupported value, or with a generic
            message when a value has such an unexpected type that a check
            itself fails (e.g. ``display_mode`` is not a string). In the
            latter case the underlying exception is attached as
            ``__cause__``.
    """
    if BrowserParams() == browser_params:
        return

    try:
        if browser_params.display_mode.lower() not in DISPLAY_MODE_VALIDATION_LIST:
            raise ConfigError(
                CONFIG_ERROR_STRING.format(
                    value=browser_params.display_mode,
                    value_list=DISPLAY_MODE_VALIDATION_LIST,
                    parameter_name="display_mode",
                )
            )

        if browser_params.browser.lower() not in SUPPORTED_BROWSER_LIST:
            raise ConfigError(
                CONFIG_ERROR_STRING.format(
                    value=browser_params.browser,
                    value_list=SUPPORTED_BROWSER_LIST,
                    parameter_name="browser",
                )
            )

        if browser_params.tp_cookies.lower() not in TP_COOKIES_OPTIONALS_LIST:
            raise ConfigError(
                CONFIG_ERROR_STRING.format(
                    value=browser_params.tp_cookies,
                    value_list=TP_COOKIES_OPTIONALS_LIST,
                    parameter_name="tp_cookies",
                )
            )

        if browser_params.callstack_instrument:
            raise ConfigError(
                "The callstacks instrument currently doesn't work "
                "as it is requires intricate machinery that broke "
                "in one of the previous Firefox versions."
            )

        # NOTE: unreachable while the check above rejects every truthy
        # callstack_instrument. Retained so that the dependency on the JS
        # instrument is enforced as soon as that blanket rejection is lifted.
        if browser_params.callstack_instrument and not browser_params.js_instrument:
            raise ConfigError(
                "The callstacks instrument currently doesn't work without "
                "the JS instrument enabled. see: "
                "https://github.com/openwpm/OpenWPM/issues/557"
            )

        save_content = browser_params.save_content
        if not isinstance(save_content, (bool, str)):
            raise ConfigError(
                GENERAL_ERROR_STRING.format(
                    value=save_content,
                    parameter_name="save_content",
                    params_type="BrowserParams",
                )
            )

        # A non-empty string is a comma-separated list of resource types.
        if save_content and isinstance(save_content, str):
            configured_types = set(save_content.split(","))
            if not configured_types.issubset(ALL_RESOURCE_TYPES):
                diff = configured_types.difference(ALL_RESOURCE_TYPES)
                # One message string: the two literals are concatenated
                # before formatting (previously a stray comma passed them to
                # ConfigError as two separate arguments).
                raise ConfigError(
                    "Unrecognized resource types provided "
                    "in browser_params.save_content (%s)" % diff
                )
    except ConfigError:
        # The parameter-specific diagnostics raised above must reach the
        # caller unchanged instead of being replaced by the generic message.
        raise
    except Exception as err:
        # Only ordinary exceptions are translated; KeyboardInterrupt and
        # SystemExit propagate untouched.
        raise ConfigError(
            "Something went wrong while validating BrowserParams. "
            "Please check values provided for BrowserParams are of expected types"
        ) from err
def validate_manager_params(manager_params: ManagerParams) -> None:
    """Check that the values of a ``ManagerParams`` instance are usable.

    An instance that compares equal to a freshly constructed
    ``ManagerParams()`` is accepted without running any check.

    Raises:
        ConfigError: If the suffix of ``log_path`` is not listed in
            ``LOG_EXTENSION_TYPE_LIST``, if ``log_path`` is of a type whose
            suffix cannot be inspected, or if ``failure_limit`` is not an
            ``int``.
    """
    if ManagerParams() == manager_params:
        return

    try:
        suffix = manager_params.log_path.suffix
        suffix_supported = suffix.lower() in LOG_EXTENSION_TYPE_LIST
    except (TypeError, AttributeError):
        raise ConfigError(
            GENERAL_ERROR_STRING.format(
                value=manager_params.log_path,
                parameter_name="log_file",
                params_type="ManagerParams",
            )
        )
    else:
        if not suffix_supported:
            raise ConfigError(
                EXTENSION_ERROR_STRING.format(
                    extension=suffix or "no",
                    value_list=LOG_EXTENSION_TYPE_LIST,
                    parameter_name="log_file",
                )
            )

    # This check is necessary to not cause any internal error
    limit = manager_params.failure_limit
    if isinstance(limit, int):
        return

    generic_message = GENERAL_ERROR_STRING.format(
        value=limit,
        parameter_name="failure_limit",
        params_type="ManagerParams",
    )
    # Swap the generic documentation pointer for a type hint specific to
    # failure_limit.
    raise ConfigError(
        generic_message.replace(
            "Please look at docs/Configuration.md for more information",
            "failure_limit must be of type `int` or `None`",
        )
    )
def validate_crawl_configs(
    manager_params: ManagerParams, browser_params: List[BrowserParams]
) -> None:
    """Validate a complete crawl configuration.

    Runs ``validate_manager_params`` first, then ``validate_browser_params``
    on every element of ``browser_params``, and finally checks that the
    number of browser configurations equals ``manager_params.num_browsers``.

    Raises:
        ConfigError: Propagated from the individual validators, or raised
            here when the number of ``BrowserParams`` instances differs from
            ``manager_params.num_browsers``.
    """
    validate_manager_params(manager_params)

    for single_browser_params in browser_params:
        validate_browser_params(single_browser_params)

    if len(browser_params) == manager_params.num_browsers:
        return

    raise ConfigError(
        "Number of BrowserParams instances is not the same "
        "as manager_params.num_browsers. Make sure you are assigning number of browsers "
        "to be used to manager_params.num_browsers in your entry file"
    )
class ConfigEncoder(JSONEncoder):
    """JSON encoder that can additionally serialize ``pathlib.Path`` objects."""

    def default(self, obj):
        """Encode ``Path`` objects as their resolved string form.

        Every other object is handed to ``JSONEncoder.default``, which
        raises ``TypeError`` for types it cannot serialize.
        """
        if not isinstance(obj, Path):
            return JSONEncoder.default(self, obj)
        resolved = obj.resolve()
        return str(resolved)