# Source code for pdf12step.client

import re
import requests
import os

from pdf12step.cached import cached_property
from pdf12step.config import DATA_DIR
from pdf12step.utils import csv_dump, json_dump
from pdf12step.log import logger


# Baseline query parameters; Client.meetings copies this mapping and lets
# caller-supplied parameters override it.
DEFAULTS = dict(mode='search')

# Matches a `nonce":"<hex>"` fragment and captures the hexadecimal digits.
NONCE_RE = re.compile(r'nonce":"([0-9a-fA-F]+)"')

# Headers attached to outgoing requests: a desktop Chrome user-agent string
# (presumably so the site serves the client like a regular browser).
HEADERS = {
    'user-agent': (
        'Mozilla/5.0 (X11; Linux x86_64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/100.0.4896.127 Safari/537.36'
    ),
}


class Client:
    """
    Client that makes HTTP[S] calls to the WP site and fetches the data

    :param str site_url: Base URL of the WP site to gather data from
    :param str api_url: API endpoint, either an absolute URL or a path
        relative to ``site_url``
    :param str nonce_url: Optional page the nonce is scraped from, either an
        absolute URL or a path relative to ``site_url``
    :param str api_key: Optional key sent as ``key`` with meetings requests
    :raises ValueError: If ``site_url`` or ``api_url`` is empty
    """
    # 'locations', 'groups' and 'regions' are also available as methods but
    # are not downloaded by default for now.
    sections = ('meetings',)
    nonce_url = api_url = None

    def __init__(self, site_url, api_url, nonce_url=None, api_key=None):
        if not site_url:
            raise ValueError('Site URL required, please set site_url in your config')
        if not api_url:
            raise ValueError('API URL required, please set api_url in your config')
        # Strip trailing slashes so joining with relative paths yields one '/'.
        self.site_url = site_url.rstrip('/')
        self.api_key = api_key
        self.api_url = self._absolute(api_url)
        if nonce_url:
            self.nonce_url = self._absolute(nonce_url)

    def _absolute(self, url):
        """Returns ``url`` unchanged if it starts with ``http``, else joined onto ``site_url``"""
        return url if url.startswith('http') else f'{self.site_url}/{url}'

    @cached_property
    def nonce(self):
        """
        Fetches the nonce on a base page to use in subsequent requests to the WP site

        Bypasses WP CSRF protection

        :rtype: str
        :returns: The captured hex nonce, or ``None`` if the page contains none
        """
        response = requests.get(self.nonce_url, headers=HEADERS)
        response.raise_for_status()
        # A compiled pattern's second positional argument is ``pos`` (a start
        # offset), not flags, so only the text is passed here.
        match = NONCE_RE.search(response.content.decode())
        return match.group(1) if match else None

    def _dispatch(self, method, url, *args, **kwargs):
        """
        Sends a request with the named ``requests`` function and decodes the JSON body

        :param str method: Name of the ``requests`` function to call (eg get/post)
        :param str url: Absolute URL, or a path relative to ``site_url``
        :returns: The decoded JSON payload of the response
        """
        url = self._absolute(url)
        logger.info(f'{method.upper()} {url} {args}')
        # Layer caller-supplied headers over the defaults instead of dropping
        # them, and never hand the shared HEADERS dict itself to requests.
        kwargs['headers'] = {**HEADERS, **(kwargs.get('headers') or {})}
        response = getattr(requests, method)(url, *args, **kwargs)
        if response.status_code != 200:
            logger.error(f'Bad response: {response.content}')
            response.raise_for_status()
        # Content-Type is optional; a missing header must not break logging.
        content_type = response.headers.get('Content-Type', '').split(';')[0]
        logger.info(f'GOT {len(response.content)}B {content_type} in {response.elapsed}')
        return response.json()

    def get(self, *args, **kwargs):
        """Returns a GET request to the given resource"""
        return self._dispatch('get', *args, **kwargs)

    def post(self, *args, **kwargs):
        """Returns a POST request to the given resource"""
        return self._dispatch('post', *args, **kwargs)

    def tsml(self, entity, params=None):
        """
        Returns and loads the data from the named entity TSML endpoint

        :param str entity: Name of the entity to load (eg meetings/locations)
        :param dict params: Extra query parameters; the mapping is not modified
        :rtype: list
        """
        # Work on a copy so the caller's dict never gains an 'action' key.
        query = dict(params) if params else {}
        query['action'] = f'tsml_{entity}'
        return self.get(self.api_url, query)

    def meetings(self, **params):
        """
        Returns meeting data with the given query params

        :param dict params: Query parameters to use in GET request
        :rtype: list
        """
        data = DEFAULTS.copy()
        data.update(params, action='meetings')
        if self.api_key:
            data['key'] = self.api_key
        if self.nonce_url:
            data['nonce'] = self.nonce
        # NOTE(review): a trailing ``return self.tsml('meetings')`` followed
        # this return and looked unreachable, so it was dropped. Confirm it was
        # not meant as the fallback for when no nonce_url is configured.
        return self.get(self.api_url, data)

    def locations(self):
        """
        Loads locations TSML endpoint data

        :rtype: list
        """
        return self.tsml('locations')

    def groups(self):
        """
        Loads groups TSML endpoint data

        :rtype: list
        """
        return self.tsml('groups')

    def regions(self):
        """
        Loads regions TSML endpoint data

        :rtype: list
        """
        return self.tsml('regions')

    def download(self, sections=None, format='json', data_dir=DATA_DIR, prefix=None):
        """
        Downloads all the TSML endpoints meeting data to the DATA_DIR destination.

        :param tuple sections: Specific sections to download (eg meetings);
            defaults to ``self.sections`` when empty or omitted
        :param str format: Which format to load the data in (eg json/csv);
            anything other than ``json`` is written with the CSV dumper
        :param str data_dir: Directory the files are written to; created if missing
        :param str prefix: Optional prefix prepended to each file name
        :raises ValueError: If a requested section is not an attribute of the client
        """
        if not sections:
            sections = self.sections
        if not os.path.exists(data_dir):
            logger.warning(f'data dir not found, creating: {data_dir}')
            # exist_ok covers the directory appearing between check and creation.
            os.makedirs(data_dir, exist_ok=True)
        for section in sections:
            if not hasattr(self, section):
                raise ValueError(f'Section {section} not known')
            data = getattr(self, section)()
            fname = f'{prefix}-{section}.{format}' if prefix else f'{section}.{format}'
            outfile = os.path.join(data_dir, fname)
            if format == 'json':
                json_dump(data, outfile)
            else:
                csv_dump(data, outfile)
            logger.info(f'Downloaded {outfile}')