Source code for osos.osos

"""
osos base class.
"""
import datetime
import os
import pandas as pd
import logging
from warnings import warn
from osos.api_github import Github
from osos.api_pypi import Pypi
from osos.api_conda import Conda


OSOS_DIR = os.path.dirname(os.path.realpath(__file__))
DATA_DIR = os.path.join(os.path.dirname(OSOS_DIR), 'data')

logger = logging.getLogger(__name__)


[docs]class Osos: """Base class to handle open source ops and statistics (osos) for a given repo/package. """ def __init__(self, git_owner, git_repo, pypi_name=None, conda_org=None, conda_name=None): """ Parameters ---------- git_owner : str Github repository owner, e.g. https://github.com/{owner}/{repo}. Case insensitive. git_repo : str Github repository name, e.g. https://github.com/{owner}/{repo}. Case insensitive. pypi_name : str | None pypi package name. Note that this should include the prefix for nrel packages e.g. reV -> nrel-rev. This can be None if there is no pypi package. Case insensitive. conda_org : str Conda organization name, for example: https://anaconda.org/{org}/{name}. Case insensitive. conda_name : str Conda package name, for example: https://anaconda.org/{org}/{name}. Case insensitive. """ self._git_owner = git_owner self._git_repo = git_repo self._pypi_name = pypi_name self._conda_org = conda_org self._conda_name = conda_name self._gh = Github(self._git_owner, self._git_repo) d0 = datetime.date.today() d1 = datetime.date.today() - datetime.timedelta(days=13) self._index = pd.date_range(d1, d0, freq='1D').date
[docs] def clean_table(self, table): """Fill nan values and make sure the timeseries index has 14 days. Parameters ---------- table : pd.DataFrame Repository usage and statistics table with datetime.date index for the last two weeks including today and columns for various github and pypi usage metrics. Returns ------- table : pd.DataFrame Repository usage and statistics table with datetime.date index for the last two weeks including today and columns for various github and pypi usage metrics. """ table = table.reindex(self._index) timeseries_cols = ['clones', 'clones_unique', 'views', 'views_unique', 'commits', 'pypi_daily'] timeseries_cols = [c for c in table.columns if c in timeseries_cols] other_cols = [c for c in table.columns if c not in timeseries_cols] table[timeseries_cols] = table[timeseries_cols].fillna(0) table[other_cols] = table[other_cols].ffill().bfill() table[other_cols] = table[other_cols].fillna(0) return table
[docs] def make_table(self): """Make the usage and statistics table for the last two weeks. Returns ------- table : pd.DataFrame Repository usage and statistics table with datetime.date index for the last two weeks including today and columns for various github and pypi usage metrics. """ logger.info('Collecting data for: ' f'"{self._git_owner}/{self._git_repo}"') table = pd.DataFrame(index=self._index) try: table = table.join(self._gh.clones()) table = table.join(self._gh.views()) except OSError: msg = ('Could not get github clone/views data from ' f'"{self._git_owner}/{self._git_repo}", ' 'try setting a GITHUB_TOKEN with push permissions.') warn(msg) logger.warning(msg) issues_pulls = (self._gh.issues_closed(), self._gh.issues_open(), self._gh.pulls_closed(), self._gh.pulls_open()) options = (('issues', 'closed'), ('issues', 'open'), ('pulls', 'closed'), ('pulls', 'open')) for ip_count, (op1, op2) in zip(issues_pulls, options): table[f'{op1}_{op2}'] = ip_count table['forks'] = self._gh.forks() table['stargazers'] = self._gh.stargazers() table['subscribers'] = self._gh.subscribers() table['contributors'] = self._gh.contributors() table = table.join(self._gh.commits(date_iter=self._index)) table['total_commits'] = self._gh.commit_count() if self._pypi_name is not None: pypi_out = Pypi.get_daily_data(self._pypi_name, table.index.values) table = table.join(pypi_out) if self._conda_org is not None and self._conda_name is not None: conda_out = Conda.get_data(self._conda_org, self._conda_name) table['conda_total_downloads'] = conda_out table['updated_on'] = datetime.date.today() table = self.clean_table(table) return table
[docs] def update(self, fpath_out): """Update and save the fpath_out file. The current update data will be used if there are duplicates. Parameters ---------- fpath_out : str Output file to save the osos output table. If the file exists, it will be updated with the latest data. This path can include the keywords "DATA_DIR" and "NAME" which will get replaced by the system location of the /osos/data/ directory and the github repo name, respectively. Returns ------- table : pd.DataFrame osos table including the original data from fpath_out (if exists) updated with the currently available data from github and pypi. This is also saved to fpath_out. """ fpath_out = fpath_out.replace('DATA_DIR', DATA_DIR) fpath_out = fpath_out.replace('NAME', self._git_repo) table = self.make_table() if os.path.exists(fpath_out): logger.info(f'Updating cached file: {fpath_out}') original = pd.read_csv(fpath_out, index_col=0) original.index = pd.to_datetime(original.index.values).date table = pd.concat([table, original]) table = table[~table.index.duplicated(keep='first')] table = table.sort_index() logger.info(f'Saved osos output to: {fpath_out}') table.to_csv(fpath_out) return table
[docs] @classmethod def run_config(cls, config): """Run multiple osos jobs from a csv config. Parameters ---------- config : str Path to .csv config file with columns for git_owner, git_repo, fpath_out, and (optionally) pypi_name, conda_org, and conda_name. """ assert os.path.exists(config), 'config must be a valid filepath' assert config.endswith('.csv'), 'config must be .csv' config = pd.read_csv(config) required = ('git_owner', 'git_repo', 'fpath_out') missing = [r for r in required if r not in config] if any(missing): msg = f'Config had missing required columns: {missing}' logger.error(msg) raise KeyError(msg) for _, row in config.iterrows(): row = row.to_dict() conda_org = row.get('conda_org', None) conda_name = row.get('conda_name', None) pypi_name = row.get('pypi_name', None) conda_org = conda_org if isinstance(conda_org, str) else None conda_name = conda_name if isinstance(conda_name, str) else None pypi_name = pypi_name if isinstance(pypi_name, str) else None fpath_out = row['fpath_out'].replace('DATA_DIR', DATA_DIR) fpath_out = row['fpath_out'].replace('NAME', row['git_repo']) osos = cls(row['git_owner'], row['git_repo'], pypi_name=pypi_name, conda_org=conda_org, conda_name=conda_name) osos.update(fpath_out)