# Source code for pdf12step.client

import re
import requests
import os

from pdf12step.cached import cached_property
from pdf12step.config import DATA_DIR
from pdf12step.utils import csv_dump, json_dump
from pdf12step.log import logger


# Query parameters merged into every nonce-authenticated meetings request.
DEFAULTS = dict(mode='search')

# Captures the hexadecimal value that follows `nonce":"` in a fetched page.
NONCE_RE = re.compile(r'nonce":"([0-9a-fA-F]+)"')

# Default headers sent with each outgoing request (desktop Chrome user agent).
HEADERS = dict([
    (
        'user-agent',
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/100.0.4896.127 Safari/537.36',
    ),
])


class Client(object):
    """
    Client that makes HTTP[S] calls to the WP site and fetches the data

    :param str site_url: Base URL of the WP site to gather data from
    :param str api_url: API endpoint, either an absolute URL or a path relative to ``site_url``
    :param str nonce_url: Optional page to read the nonce from (absolute URL or path
        relative to ``site_url``); when set, meeting requests include the nonce
    :param str api_key: Optional key sent as the ``key`` query parameter of meeting requests
    :raises ValueError: If ``site_url`` or ``api_url`` is empty
    """
    sections = ('meetings',)  # 'locations', 'groups', 'regions') these arent necessary for now
    nonce_url = api_url = None
    # Seconds before a request is abandoned; without a timeout ``requests`` waits forever.
    # Override on the class/instance, or pass ``timeout=`` to ``get``/``post``.
    timeout = 30

    def __init__(self, site_url, api_url, nonce_url=None, api_key=None):
        if not site_url:
            raise ValueError('Site URL required, please set site_url in your config')
        if not api_url:
            raise ValueError('API URL required, please set api_url in your config')
        self.site_url = site_url.rstrip('/')
        self.api_key = api_key
        if nonce_url:
            self.nonce_url = self._absolute(nonce_url)
        self.api_url = self._absolute(api_url)

    def _absolute(self, url):
        """
        Returns ``url`` untouched when it starts with ``http``, otherwise joins it onto ``site_url``

        :param str url: Absolute URL or site-relative path
        :rtype: str
        """
        if url.startswith('http'):
            return url
        # Drop leading slashes so '/path' does not become 'site//path'
        return f'{self.site_url}/{url.lstrip("/")}'

    @cached_property
    def nonce(self):
        """
        Fetches the nonce on a base page to use in subsequent requests to the WP site
        Bypasses WP CSRF protection

        :returns: The nonce, or ``None`` when the page does not contain one
        :rtype: str
        :raises requests.HTTPError: If the nonce page responds with an error status
        """
        response = requests.get(self.nonce_url, headers=HEADERS, timeout=self.timeout)
        response.raise_for_status()
        content = response.content.decode()
        # The second positional argument of a compiled pattern's search() is the
        # start offset, not a flags value, so nothing else is passed here.
        match = NONCE_RE.search(content)
        if match is None:
            logger.warning(f'No nonce found at {self.nonce_url}')
            return None
        return match.group(1)

    def _dispatch(self, method, url, *args, **kwargs):
        """
        Sends a request through the ``requests`` function named by ``method`` and decodes the JSON body

        :param str method: Name of the ``requests`` function to call (eg get/post)
        :param str url: Absolute URL or path relative to ``site_url``
        :returns: Decoded JSON body of the response
        :raises requests.HTTPError: If the response carries an error status
        """
        url = self._absolute(url)
        logger.info(f'{method.upper()} {url} {args}')
        send = getattr(requests, method)
        # Caller supplied headers extend the defaults rather than being discarded
        kwargs['headers'] = {**HEADERS, **(kwargs.get('headers') or {})}
        kwargs.setdefault('timeout', self.timeout)
        response = send(url, *args, **kwargs)
        if response.status_code != 200:
            logger.error(f'Bad response: {response.content}')
        response.raise_for_status()
        # Content-Type may be absent; never let the log line break a good response
        content_type = response.headers.get('Content-Type', '').split(';')[0]
        logger.info(f'GOT {len(response.content)}B {content_type} in {response.elapsed}')
        return response.json()

    def get(self, *args, **kwargs):
        """Returns the decoded JSON of a GET request to the given resource"""
        return self._dispatch('get', *args, **kwargs)

    def post(self, *args, **kwargs):
        """Returns the decoded JSON of a POST request to the given resource"""
        return self._dispatch('post', *args, **kwargs)

    def tsml(self, entity, params=None):
        """
        Returns and loads the data from the named entity TSML endpoint

        :param str entity: Name of the entity to load (eg meetings/locations)
        :param dict params: Optional extra query parameters; the dict is not modified
        :rtype: list
        """
        # Work on a copy so the caller's dict does not gain an 'action' key
        query = dict(params or {})
        query['action'] = f'tsml_{entity}'
        return self.get(self.api_url, query)

    def meetings(self, **params):
        """
        Returns meeting data with the given query params

        With a ``nonce_url`` configured the request uses the ``meetings`` action together
        with ``DEFAULTS`` and the nonce; otherwise it goes through :meth:`tsml`.
        In both cases the given params and the API key (when set) are sent.

        :param dict params: Query parameters to use in GET request
        :rtype: list
        """
        if not self.nonce_url:
            # Forward the params and key here too instead of silently dropping them
            query = dict(params)
            if self.api_key:
                query['key'] = self.api_key
            return self.tsml('meetings', query)
        data = DEFAULTS.copy()
        data.update(params, action='meetings')
        if self.api_key:
            data['key'] = self.api_key
        data['nonce'] = self.nonce
        return self.get(self.api_url, data)

    def locations(self):
        """
        Loads locations TSML endpoint data

        :rtype: list
        """
        return self.tsml('locations')

    def groups(self):
        """
        Loads groups TSML endpoint data

        :rtype: list
        """
        return self.tsml('groups')

    def regions(self):
        """
        Loads regions TSML endpoint data

        :rtype: list
        """
        return self.tsml('regions')

    def download(self, sections=None, format='json', data_dir=DATA_DIR, prefix=None):
        """
        Downloads the data of each requested section into ``data_dir``, one file per section.

        :param tuple sections: Specific sections to download (eg meetings); defaults to
            ``self.sections`` when empty or omitted
        :param str format: File extension and format to write; ``json`` uses ``json_dump``,
            any other value uses ``csv_dump``
        :param str data_dir: Destination directory, created when missing
        :param str prefix: Optional file name prefix (``<prefix>-<section>.<format>``)
        :raises ValueError: If a section does not name a callable attribute of the client
        """
        sections = sections or self.sections
        if not os.path.exists(data_dir):
            logger.warning(f'data dir not found, creating: {data_dir}')
        # exist_ok avoids failing when the directory appears between check and creation
        os.makedirs(data_dir, exist_ok=True)
        for section in sections:
            fetch = getattr(self, section, None)
            if not callable(fetch):
                raise ValueError(f'Section {section} not known')
            data = fetch()
            fname = f'{prefix}-{section}.{format}' if prefix else f'{section}.{format}'
            outfile = os.path.join(data_dir, fname)
            if format == 'json':
                json_dump(data, outfile)
            else:
                csv_dump(data, outfile)
            logger.info(f'Downloaded {outfile}')
# Source code for pdf12step.client
#
# Navigation
#
# Related Topics