Source code for elm.web.rhub

"""
Code to build Corpus from the researcher hub.
"""
import os
import os.path
import logging
import json
import math
import re
import requests
import pandas as pd

logger = logging.getLogger(__name__)


class ProfilesRecord(dict):
    """Class to handle a single profile as dictionary data.

    This class requires setting an 'RHUB_API_KEY' environment
    variable to access the Pure Web Service. The API key can be
    obtained by contacting an NREL library representative:
    Library@nrel.gov.
    """

    def __init__(self, record):
        """
        Parameters
        ----------
        record : dict
            Profile in dict form, typically a response from the API.
        """
        api_key = os.getenv("RHUB_API_KEY")
        assert api_key is not None, "Please set RHUB_API_KEY!"
        assert isinstance(record, dict)
        super().__init__(**record)

    @staticmethod
    def clean_text(html_text):
        """Clean html text from API response.

        Parameters
        ----------
        html_text : str
            Text containing html characters.

        Returns
        -------
        clean : str
            Text with html characters removed.
        """
        clean = re.sub(r'<.*?>', '', html_text)
        clean = clean.replace('\xa0', ' ')
        return clean

    @property
    def first_name(self):
        """Get the first name of this researcher.

        Returns
        -------
        first : str
            First name of researcher.
        """
        names = self.get('name')
        first = names.get('firstName')
        return first

    @property
    def last_name(self):
        """Get the last name of this researcher.

        Returns
        -------
        last : str
            Last name of researcher.
        """
        names = self.get('name')
        last = names.get('lastName')
        return last

    @property
    def title(self):
        """Get the full name of this researcher.

        Returns
        -------
        full : str
            Full name of researcher.
        """
        names = self.get('name')
        first = names.get('firstName')
        last = names.get('lastName')
        full = first + ' ' + last
        return full

    @property
    def email(self):
        """Get the email address of this researcher.

        Returns
        -------
        email : str
            Email address of researcher.
        """
        email = None
        orgs = self.get('staffOrganisationAssociations')
        if orgs:
            emails_dict = orgs[0].get('emails')
            if emails_dict:
                email = emails_dict[0].get('value').get('value')
        return email

    @property
    def url(self):
        """Get the url of this researcher's profile.

        Returns
        -------
        url : str
            URL to researcher's profile.
        """
        info = self.get('info')
        url = info.get('portalUrl')
        return url

    @property
    def id(self):
        """Get API ID of researcher.

        Returns
        -------
        id : str
            Researcher ID.
        """
        level = self.get('ids')[0]
        id = level.get('value').get('value')
        return id

    @property
    def position(self):
        """Get the position of this researcher.

        Returns
        -------
        position : str
            Researcher's position.
        """
        position = None
        org = self.get('staffOrganisationAssociations')
        if org:
            info = org[0].get('jobDescription')
            text = info.get('text')[0]
            position = text.get('value')
        return position

    @property
    def profile_information(self):
        """Get key profile information for this record: Personal Profile,
        Research Interests, Professional Experience.

        Returns
        -------
        bio : str
            Researcher's profile text.
        interests : str
            Text from Research Interests section.
        experience : str
            Text from Professional Experience section.
        """
        prof = self.get('profileInformations')
        bio = None
        interests = None
        experience = None
        if prof:
            for section in prof:
                type = section.get('type').get('term')
                if 'Personal Profile' in str(type):
                    info = section.get('value').get('text')[0]
                    bio = info.get('value')
                    bio = self.clean_text(bio)
                if 'Research Interests' in str(type):
                    info = section.get('value').get('text')[0]
                    interests = info.get('value')
                    interests = self.clean_text(interests)
                if 'Professional Experience' in str(type):
                    info = section.get('value').get('text')[0]
                    experience = info.get('value')
                    experience = self.clean_text(experience)
        return bio, interests, experience

    @property
    def education(self):
        """Get the education information of this researcher.

        Returns
        -------
        out_strings : list
            Sentences describing the researcher's education, including
            degree level (ex: Master, Bachelor, PhD), area of study
            (ex: Mechanical Engineering), and the school awarding the
            degree.
        """
        researcher_name = self.title
        edu = self.get('educations')
        out_strings = []
        if edu:
            for e in edu:
                try:
                    if e.get('projectTitle'):
                        quali = e.get('qualification')
                        level = quali.get('term').get('text')[0].get('value')
                        deg = e.get('projectTitle').get('text')[0].get('value')
                        org = e.get('organisationalUnits')
                        if org:
                            value = org[0].get('externalOrganisationalUnit')
                            name = value.get('name')
                            school = name.get('text')[0].get('value')
                        else:
                            deg_school = deg
                            deg = deg_school.split(',')[0]
                            school = deg_school.split(',')[1]
                        deg_string = (f'{researcher_name} has a {level} '
                                      f'degree in {deg} from {school}. ')
                        out_strings.append(deg_string)
                    else:
                        quali = e.get('qualification')
                        level = quali.get('term').get('text')[0].get('value')
                        org = e.get('organisationalUnits')[0]
                        org_unit = org.get('externalOrganisationalUnit')
                        name = org_unit.get('name')
                        school = name.get('text')[0].get('value')
                        deg_string = (f'{researcher_name} has a {level} '
                                      f'degree from {school}. ')
                        out_strings.append(deg_string)
                except Exception:
                    pass
        return out_strings

    @property
    def publications(self):
        """Get the publications this researcher contributed to.

        Returns
        -------
        pubs : list
            All publications associated with this researcher.
        """
        api_key = os.getenv("RHUB_API_KEY")
        assert api_key is not None, "Please set RHUB_API_KEY!"
        id = self.get('pureId')
        url = (f'https://research-hub.nrel.gov/ws/api/524/persons/'
               f'{id}/research-outputs?size=100'
               f'&apiKey={api_key}')
        session = requests.Session()
        response = session.get(url, headers={'Accept': 'application/json'})
        content = response.json()['items']
        pubs = []
        for pub in content:
            title = pub.get('title').get('value')
            pubs.append(title)
        return pubs

    def download(self, fp):
        """Download a text file containing this researcher's profile
        information.

        Parameters
        ----------
        fp : str
            Filepath to download this record to.
        """
        name = self.title
        if self.position:
            full = (f"The following is a brief biography for {name} "
                    f"who is a {self.position} for the National Renewable "
                    f"Energy Laboratory: ")
        else:
            full = (f"The following is a brief biography for {name} "
                    f"who works for the National Renewable "
                    f"Energy Laboratory: ")
        profile, interests, experience = self.profile_information
        if profile:
            full += profile + ' '
        if interests:
            research = (f"{name}'s research interests include: "
                        f"{interests}. ")
            full += research
        if experience:
            research = (f"{name}'s professional experience includes: "
                        f"{experience}. ")
            full += research
        if self.education:
            for edu in self.education:
                full += edu
        if self.publications:
            publications = (f"{name} has been involved in the following "
                            f"publications: {', '.join(self.publications)}. ")
            full += publications
        with open(fp, "w") as text_file:
            text_file.write(full)
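

# Example usage (a hedged sketch, not part of the module API): given a single
# person payload ``person_dict`` already pulled from the Pure Web Service
# (the same API family queried by the ``publications`` property above), the
# record can be wrapped and written to disk like this. ``person_dict`` and the
# output path are placeholders; records are usually built in bulk by
# ``ProfilesList`` below.
#
#     record = ProfilesRecord(person_dict)
#     print(record.title, record.email, record.position)
#     record.download(f'./profiles/{record.id}.txt')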


class ProfilesList(list):
    """Class to retrieve and handle multiple profiles from an API URL.

    This class requires setting an 'RHUB_API_KEY' environment
    variable to access the Pure Web Service. The API key can be
    obtained by contacting an NREL library representative:
    Library@nrel.gov.
    """

    def __init__(self, url, n_pages=1):
        """
        Parameters
        ----------
        url : str
            Research Hub API URL to request, see this for details:
            https://research-hub.nrel.gov/ws/api/524/api-docs/index.html
        n_pages : int
            Number of pages to get from the API. Typical response has
            20 entries per page. Default of 1 ensures that this class
            doesn't hang on a million responses.
        """
        api_key = os.getenv("RHUB_API_KEY")
        assert api_key is not None, "Please set RHUB_API_KEY!"
        self.url = url
        self._session = requests.Session()
        self._response = None
        self._n_pages = 0
        self._iter = 0

        records = self._get_all(n_pages)
        records = [ProfilesRecord(single) for single in records]
        records = [prof for prof in records if prof.last_name != 'NREL']
        super().__init__(records)

    def _get_first(self):
        """Get the first page of profiles.

        Returns
        -------
        first_page : list
            First page of records as a list.
        """
        self._response = self._session.get(
            self.url, headers={'Accept': 'application/json'})
        resp = self._response.json()

        if not self._response.ok:
            msg = ('API Request got error {}: "{}"'
                   .format(self._response.status_code,
                           self._response.reason))
            raise RuntimeError(msg)

        first_page = self._response.json()['items']

        self._n_pages = 1
        if 'last' not in self._response.links:
            count_pages = resp['count'] / resp['pageInformation'].get('size')
            self._n_pages = math.ceil(count_pages)
        else:
            url = self._response.links['last']['url']
            self._n_pages = int(url.split('page=')[-1])

        logger.debug('Found approximately {} records.'
                     .format(self._n_pages * len(first_page)))

        return first_page

    def _get_pages(self, n_pages):
        """Get response pages up to n_pages from Research Hub.

        Parameters
        ----------
        n_pages : int
            Number of pages to retrieve

        Returns
        -------
        next_pages : generator
            Generator of next pages, each of which is a list of
            profiles.
        """
        if n_pages > 1:
            for page in range(2, self._n_pages + 1):
                if page <= n_pages:
                    next_page = self._session.get(
                        self.url, params={'page': page},
                        headers={'Accept': 'application/json'})
                    next_page = next_page.json()['items']
                    yield next_page
                else:
                    break

    def _get_all(self, n_pages):
        """Get all pages of profiles up to n_pages.

        Parameters
        ----------
        n_pages : int
            Number of pages to retrieve

        Returns
        -------
        all_records : list
            List of all profile records.
        """
        first_page = self._get_first()
        records = first_page

        for page in self._get_pages(n_pages):
            records.extend(page)

        return records

    def meta(self):
        """Get a meta dataframe with details on all of the profiles.

        Returns
        -------
        df : pd.DataFrame
            Dataframe containing all metadata information.
        """
        i = 0
        attrs = ('title', 'email', 'url', 'id')
        df = pd.DataFrame(columns=attrs)
        for record in self:
            for attr in attrs:
                out = getattr(record, attr)
                if not isinstance(out, str):
                    out = json.dumps(out)
                df.at[i, attr] = out
            df.at[i, 'fn'] = f'{record.id}.txt'
            df.at[i, 'category'] = 'Researcher Profile'
            i += 1
        return df

    def download(self, out_dir):
        """Download all profiles from the records in this object into a
        directory. TXT files will be given file names based on
        researcher ID.

        Parameters
        ----------
        out_dir : str
            Directory to download TXT files to. This directory will be
            created if it does not already exist.
        """
        os.makedirs(out_dir, exist_ok=True)

        for record in self:
            fn = record.id
            fp_out = os.path.join(out_dir, fn + '.txt')
            if not os.path.exists(fp_out):
                try:
                    record.download(fp_out)
                except Exception as e:
                    print(f"Could not download {record.title} "
                          f"with error {e}")
                    logger.exception('Could not download profile ID {}: {}'
                                     .format(record.title, e))

        logger.info('Finished Profiles download!')
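

# Example usage (a hedged sketch; the "persons" query URL below is an
# assumption based on the Pure API docs linked in the class docstring and on
# the endpoint used by ``ProfilesRecord.publications``, and RHUB_API_KEY must
# be set in the environment):
#
#     api_key = os.environ['RHUB_API_KEY']
#     url = ('https://research-hub.nrel.gov/ws/api/524/persons'
#            f'?apiKey={api_key}')
#     profiles = ProfilesList(url, n_pages=2)
#     profiles.meta().to_csv('./profiles_meta.csv', index=False)
#     profiles.download('./profiles_txt')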


class PublicationsRecord(dict):
    """Class to handle a single publication as dictionary data.

    This class requires setting an 'RHUB_API_KEY' environment
    variable to access the Pure Web Service. The API key can be
    obtained by contacting an NREL library representative:
    Library@nrel.gov.
    """

    def __init__(self, record):
        """
        Parameters
        ----------
        record : dict
            Research Hub record in dict form, typically a response
            from the API.
        """
        api_key = os.getenv("RHUB_API_KEY")
        assert api_key is not None, "Please set RHUB_API_KEY!"
        assert isinstance(record, dict)
        super().__init__(**record)

    @property
    def title(self):
        """Get the title of this publication.

        Returns
        -------
        title : str
            Publication title.
        """
        title = self.get('title').get('value')
        return title

    @property
    def year(self):
        """Get the publish year.

        Returns
        -------
        year : int
            Year of publication.
        """
        status = self.get('publicationStatuses')[0]
        year = status.get('publicationDate').get('year')
        return year

    @property
    def url(self):
        """Get the url associated with the publication.

        Returns
        -------
        url : str
            Publication URL.
        """
        info = self.get('info')
        url = info.get('portalUrl')
        return url

    @property
    def id(self):
        """Get the 'NREL Publication Number' for this record.

        Returns
        -------
        id : str
            Publication Number.
        """
        try:
            group = self.get('keywordGroups')[0]
            cont = group.get('keywordContainers')[0]
            id = cont.get('freeKeywords')[0].get('freeKeywords')[0]
            id = id.replace('/', '-')
        except TypeError:
            id = self.get('externalId')
        return id

    @property
    def authors(self):
        """Get the names of all authors for a publication.

        Returns
        -------
        out : str
            String containing author names.
        """
        pa = self.get('personAssociations')
        if not pa:
            return None

        authors = []
        for r in pa:
            name = r.get('name')
            if not name:
                continue
            first = name.get('firstName')
            last = name.get('lastName')
            full = " ".join(filter(bool, [first, last]))
            if not full:
                continue
            authors.append(full)

        out = ', '.join(authors)
        return out

    @property
    def category(self):
        """Get the publication category for this record.

        Returns
        -------
        cat : str
            Publication category, ex: Technical Report, Article.
        """
        type = self.get('type')
        term = type.get('term')
        cat = term.get('text')[0].get('value')
        return cat

    @property
    def links(self):
        """Get the doi and pdf links for a publication.

        Returns
        -------
        doi : str
            doi link for publication.
        pdf_url : str
            pdf link for publication.
        """
        ev = self.get('electronicVersions')
        doi = None
        pdf_url = None
        if ev:
            for link in ev:
                if link.get('doi'):
                    doi = link.get('doi')
                if link.get('link'):
                    pdf_url = link.get('link')
        return doi, pdf_url

    @property
    def abstract(self):
        """Get the abstract text for this publication.

        Returns
        -------
        value : str
            String containing abstract text.
        """
        abstract = self.get('abstract')
        if not abstract:
            return None

        text = abstract.get('text')
        if not text:
            return None

        value = text[0].get('value')
        return value

    def save_abstract(self, abstract_text, out_fp):
        """Save the abstract text to a .txt file at the filepath provided.

        Parameters
        ----------
        abstract_text : str
            String with abstract text.
        out_fp : str
            Filepath to save the abstract TXT file to.
        """
        title = self.title
        full = f"The report titled {title} can be summarized as follows: "
        full += abstract_text
        with open(out_fp, "w") as text_file:
            text_file.write(full)

    def download(self, pdf_dir, txt_dir):
        """Download PDFs and TXT files to the directories provided. If a
        record does not fit the criteria for PDF download, a TXT file
        with the record abstract will be saved to the TXT directory.

        Parameters
        ----------
        pdf_dir : str
            Directory for pdf download.
        txt_dir : str
            Directory for txt download.
        """
        category = self.category
        pdf_url = self.links[1]
        abstract = self.abstract

        pdf_categories = ['Technical Report', 'Paper', 'Fact Sheet']

        if category not in pdf_categories:
            fn = self.id.replace('/', '-') + '.txt'
            fp = os.path.join(txt_dir, fn)
            if not os.path.exists(fp):
                if abstract:
                    self.save_abstract(abstract, fp)
                else:
                    logger.info(f'{self.title}: does not have an '
                                'abstract to download')
        else:
            if pdf_url and pdf_url.endswith('.pdf'):
                fn = self.id.replace('/', '-') + '.pdf'
                fp = os.path.join(pdf_dir, fn)
                if not os.path.exists(fp):
                    session = requests.Session()
                    response = session.get(pdf_url)
                    with open(fp, 'wb') as f_pdf:
                        f_pdf.write(response.content)
            else:
                fn = self.id.replace('/', '-') + '.txt'
                fp = os.path.join(txt_dir, fn)
                self.save_abstract(abstract, fp)
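

# Example usage (a hedged sketch, not part of the module API): given a single
# research-output payload ``pub_dict`` from the Pure Web Service, the record's
# metadata can be inspected and its PDF or abstract saved like this (both
# output directories must already exist). ``pub_dict`` is a placeholder;
# records are usually built in bulk by ``PublicationsList`` below, which also
# creates the output directories.
#
#     pub = PublicationsRecord(pub_dict)
#     doi, pdf_url = pub.links
#     print(pub.title, pub.year, pub.category, doi)
#     pub.download('./pubs_pdf', './pubs_txt')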


class PublicationsList(list):
    """Class to retrieve and handle multiple publications from an API URL.

    This class requires setting an 'RHUB_API_KEY' environment
    variable to access the Pure Web Service. The API key can be
    obtained by contacting an NREL library representative:
    Library@nrel.gov.
    """

    def __init__(self, url, n_pages=1):
        """
        Parameters
        ----------
        url : str
            Research Hub API URL to request, see this for details:
            https://research-hub.nrel.gov/ws/api/524/api-docs/index.html
        n_pages : int
            Number of pages to get from the API. Typical response has
            20 entries per page. Default of 1 ensures that this class
            doesn't hang on a million responses.
        """
        api_key = os.getenv("RHUB_API_KEY")
        assert api_key is not None, "Please set RHUB_API_KEY!"
        self.url = url
        self._session = requests.Session()
        self._response = None
        self._n_pages = 0
        self._iter = 0

        records = self._get_all(n_pages)
        records = [PublicationsRecord(single) for single in records]
        super().__init__(records)

    def _get_first(self):
        """Get the first page of publications.

        Returns
        -------
        first_page : list
            Publication records as a list.
        """
        self._response = self._session.get(
            self.url, headers={'Accept': 'application/json'})
        resp = self._response.json()

        if not self._response.ok:
            msg = ('API Request got error {}: "{}"'
                   .format(self._response.status_code,
                           self._response.reason))
            raise RuntimeError(msg)

        first_page = self._response.json()['items']

        self._n_pages = 1
        if 'last' not in self._response.links:
            count_pages = resp['count'] / resp['pageInformation'].get('size')
            self._n_pages = math.ceil(count_pages)
        else:
            url = self._response.links['last']['url']
            self._n_pages = int(url.split('page=')[-1])

        logger.debug('Found approximately {} records.'
                     .format(self._n_pages * len(first_page)))

        return first_page

    def _get_pages(self, n_pages):
        """Get response pages up to n_pages from Research Hub.

        Parameters
        ----------
        n_pages : int
            Number of pages to retrieve

        Returns
        -------
        next_pages : generator
            Generator of next pages, each of which is a list of
            records.
        """
        if n_pages > 1:
            for page in range(2, self._n_pages + 1):
                if page <= n_pages:
                    next_page = self._session.get(
                        self.url, params={'page': page},
                        headers={'Accept': 'application/json'})
                    next_page = next_page.json()['items']
                    yield next_page
                else:
                    break

    def _get_all(self, n_pages):
        """Get all pages of publications up to n_pages.

        Parameters
        ----------
        n_pages : int
            Number of pages to retrieve

        Returns
        -------
        all_records : list
            List of all publication records.
        """
        first_page = self._get_first()
        records = first_page

        for page in self._get_pages(n_pages):
            records.extend(page)

        return records

    def meta(self):
        """Get a meta dataframe with details on all of the publications.

        Returns
        -------
        df : pd.DataFrame
            Dataframe containing all metadata information.
        """
        i = 0
        attrs = ('title', 'year', 'url', 'id', 'category', 'authors')
        df = pd.DataFrame(columns=attrs)
        for record in self:
            doi = record.links[0]
            pdf_url = record.links[1]
            for attr in attrs:
                out = getattr(record, attr)
                if not isinstance(out, str):
                    out = json.dumps(out)
                df.at[i, attr] = out
            df.at[i, 'doi'] = doi
            df.at[i, 'pdf_url'] = pdf_url
            i += 1
        return df

    def download(self, pdf_dir, txt_dir):
        """Download all PDFs and abstract TXTs from the records in this
        object into a directory. Files will be given file names based on
        their record ID.

        Parameters
        ----------
        pdf_dir : str
            Directory to download PDFs to. This directory will be
            created if it does not already exist.
        txt_dir : str
            Directory to download TXTs to. This directory will be
            created if it does not already exist.
        """
        os.makedirs(pdf_dir, exist_ok=True)
        os.makedirs(txt_dir, exist_ok=True)

        for record in self:
            try:
                record.download(pdf_dir, txt_dir)
            except Exception as e:
                logger.exception('Could not download {}: {}'
                                 .format(record.title, e))

        logger.info('Finished publications download!')
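

if __name__ == '__main__':
    # Minimal end-to-end sketch, not an official CLI for this module: the
    # "research-outputs" query URL below is an assumption based on the Pure
    # API docs linked in the class docstrings, and RHUB_API_KEY must be set
    # in the environment.
    logging.basicConfig(level=logging.INFO)

    api_key = os.environ['RHUB_API_KEY']
    pubs_url = ('https://research-hub.nrel.gov/ws/api/524/research-outputs'
                f'?apiKey={api_key}')

    # Pull one page of publications, write their metadata to CSV, and
    # download PDFs / abstract TXT files into the output directories.
    pubs = PublicationsList(pubs_url, n_pages=1)
    pubs.meta().to_csv('./publications_meta.csv', index=False)
    pubs.download('./publications_pdf', './publications_txt')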