"""Code to build a corpus from the NREL Research Hub."""
import os
import os.path
import logging
import json
import math
import re
import requests
import pandas as pd
# Module-level logger, named after this module, shared by the classes below.
logger = logging.getLogger(__name__)
class ProfilesRecord(dict):
    """Class to handle a single researcher profile as dictionary data.

    This class requires setting an 'RHUB_API_KEY' environment
    variable to access the Pure Web Service. The API key can be
    obtained by contacting an NREL library representative.
    """

    # Labels searched for in each profile section's type term, in the order
    # the matching texts are returned by ``profile_information``.
    _PROFILE_SECTIONS = ('Personal Profile', 'Research Interests',
                         'Professional Experience')

    def __init__(self, record):
        """
        Parameters
        ----------
        record : dict
            Profile in dict form, typically a response from the API.
        """
        api_key = os.getenv("RHUB_API_KEY")
        assert api_key is not None, "Please set RHUB_API_KEY!"
        assert isinstance(record, dict)
        # NOTE(review): every accessor below reads the profile through
        # self.get(...), so the record is copied into this dict here.
        super().__init__(record)

    @staticmethod
    def _first_text_value(node):
        """Return ``node['text'][0]['value']``, or None if a level is missing.

        Parameters
        ----------
        node : dict | None
            Mapping expected to hold a 'text' list of dicts with a 'value'.

        Returns
        -------
        value : str | None
            The first text value, or None when it cannot be reached.
        """
        if not isinstance(node, dict):
            return None
        text = node.get('text')
        if not text:
            return None
        first = text[0]
        return first.get('value') if isinstance(first, dict) else None

    @staticmethod
    def clean_text(html_text):
        """Clean html text from API response

        Parameters
        ----------
        html_text : str
            Text containing html characters.

        Returns
        -------
        clean : str
            Text with html tags removed and non-breaking spaces replaced
            by regular spaces.
        """
        clean = re.sub(r'<.*?>', '', html_text)
        clean = clean.replace('\xa0', ' ')
        return clean

    def _name_part(self, key):
        """Return one entry of the 'name' mapping, or None if unavailable."""
        name = self.get('name')
        return name.get(key) if isinstance(name, dict) else None

    @property
    def first_name(self):
        """Get the first name of this researcher.

        Returns
        -------
        first : str | None
            First name of researcher, None if not present.
        """
        return self._name_part('firstName')

    @property
    def last_name(self):
        """Get the last name of this researcher.

        Returns
        -------
        last : str | None
            Last name of researcher, None if not present.
        """
        return self._name_part('lastName')

    @property
    def title(self):
        """Get the full name of this researcher.

        Returns
        -------
        full : str
            First and last name separated by a space. Missing name parts
            are left out instead of raising.
        """
        return ' '.join(filter(bool, [self.first_name, self.last_name]))

    @property
    def email(self):
        """Get the email address of this researcher.

        Returns
        -------
        email : str | None
            First email address of the first staff organisation
            association, None if not present.
        """
        orgs = self.get('staffOrganisationAssociations')
        if not orgs:
            return None
        emails = orgs[0].get('emails')
        if not emails:
            return None
        value = emails[0].get('value')
        return value.get('value') if isinstance(value, dict) else None

    @property
    def url(self):
        """Get the url of this researcher's profile.

        Returns
        -------
        url : str | None
            URL to researcher's profile ('portalUrl'), None if not present.
        """
        info = self.get('info')
        return info.get('portalUrl') if isinstance(info, dict) else None

    @property
    def id(self):
        """Get API ID of researcher.

        Returns
        -------
        id : str | None
            Value of the first entry in 'ids', None if not present.
        """
        ids = self.get('ids')
        if not ids:
            return None
        value = ids[0].get('value')
        return value.get('value') if isinstance(value, dict) else None

    @property
    def position(self):
        """Get the position of this researcher.

        Returns
        -------
        position : str | None
            Job description text of the first staff organisation
            association, None if not present.
        """
        orgs = self.get('staffOrganisationAssociations')
        if not orgs:
            return None
        return self._first_text_value(orgs[0].get('jobDescription'))

    @property
    def profile_information(self):
        """Get key profile information for this record:
        Personal Profile, Research Interests, Professional Experience.

        Returns
        -------
        bio : str | None
            Researcher's profile text.
        interests : str | None
            Text from Research Interests section.
        experience : str | None
            Text from Professional Experience section.
        """
        found = dict.fromkeys(self._PROFILE_SECTIONS)
        for section in self.get('profileInformations') or []:
            section_type = section.get('type') or {}
            label = str(section_type.get('term'))
            text = self._first_text_value(section.get('value'))
            if text is None:
                # Nothing to clean; keep whatever was found before.
                continue
            for key in self._PROFILE_SECTIONS:
                if key in label:
                    found[key] = self.clean_text(text)
        return tuple(found[key] for key in self._PROFILE_SECTIONS)

    @staticmethod
    def _education_sentence(researcher_name, entry):
        """Build the sentence describing one 'educations' entry.

        Raises AttributeError/IndexError/TypeError when the entry does not
        have the expected structure; the caller skips such entries.
        """
        qualification = entry.get('qualification')
        level = qualification.get('term').get('text')[0].get('value')
        project = entry.get('projectTitle')
        orgs = entry.get('organisationalUnits')

        if not project:
            unit = orgs[0].get('externalOrganisationalUnit')
            school = unit.get('name').get('text')[0].get('value')
            return (f'{researcher_name} has a {level} '
                    f'degree from {school}. ')

        deg = project.get('text')[0].get('value')
        if orgs:
            unit = orgs[0].get('externalOrganisationalUnit')
            school = unit.get('name').get('text')[0].get('value')
        else:
            # No organisation given: the project title is expected to read
            # "<degree>, <school>". A title without a comma raises
            # IndexError and the entry is skipped by the caller.
            pieces = deg.split(',')
            deg = pieces[0].strip()
            school = pieces[1].strip()
        return (f'{researcher_name} has a {level} '
                f'degree in {deg} from {school}. ')

    @property
    def education(self):
        """Get the education information of this researcher.

        Returns
        -------
        out_strings : list
            One sentence per usable education entry, e.g.
            "<name> has a <level> degree in <area> from <school>. ".
            Entries that lack the expected structure are skipped.
        """
        researcher_name = self.title
        out_strings = []
        for entry in self.get('educations') or []:
            try:
                sentence = self._education_sentence(researcher_name, entry)
            except (AttributeError, IndexError, KeyError, TypeError):
                # Deliberate best effort: a malformed entry must not prevent
                # the remaining entries from being reported.
                continue
            out_strings.append(sentence)
        return out_strings

    @property
    def publications(self):
        """Get the publications this researcher contributed to.

        Performs an HTTP request against the Research Hub API on every
        access, so callers should read this property once and reuse it.

        Returns
        -------
        pubs : list
            Titles of the publications associated with this researcher.
        """
        api_key = os.getenv("RHUB_API_KEY")
        assert api_key is not None, "Please set RHUB_API_KEY!"
        pure_id = self.get('pureId')
        url = (f'https://research-hub.nrel.gov/ws/api/524/persons/'
               f'{pure_id}/research-outputs')
        # NOTE(review): query parameters follow the Pure web-service
        # conventions (pageSize / apiKey); confirm the page size of 150
        # against the deployed endpoint.
        params = {'pageSize': 150, 'apiKey': api_key}
        with requests.Session() as session:
            response = session.get(url, params=params,
                                   headers={'Accept': 'application/json'})
            # Fail with a clear HTTP error instead of a KeyError on 'items'.
            response.raise_for_status()
            content = response.json()['items']

        pubs = []
        for pub in content:
            title = (pub.get('title') or {}).get('value')
            if title:
                pubs.append(title)
        return pubs

    def download(self, fp):
        """Download text file containing researchers profile information.

        Parameters
        ----------
        fp : str
            Filepath to download this record to.
        """
        name = self.title
        position = self.position
        if position:
            full = (f"The following is a brief biography for {name} "
                    f"who is a {position} for the National Renewable "
                    f"Energy Laboratory: ")
        else:
            full = (f"The following is a brief biography for {name} "
                    f"who works for the National Renewable "
                    f"Energy Laboratory: ")

        parts = [full]
        profile, interests, experience = self.profile_information
        if profile:
            parts.append(profile + ' ')
        if interests:
            parts.append(f"{name}'s research interests include: "
                         f"{interests}. ")
        if experience:
            parts.append(f"{name}'s professional experience includes: "
                         f"{experience}. ")

        parts.extend(self.education)

        # Read once: every access to the property issues an API request.
        pubs = self.publications
        if pubs:
            parts.append(f"{name} has been involved in the following "
                         f"publications: {', '.join(pubs)}. ")

        # Explicit encoding so non-ASCII names/text do not depend on the
        # platform default.
        with open(fp, "w", encoding="utf-8") as text_file:
            text_file.write(''.join(parts))
class ProfilesList(list):
    """Class to retrieve and handle multiple profiles from an API URL.

    This class requires setting an 'RHUB_API_KEY' environment
    variable to access the Pure Web Service. The API key can be
    obtained by contacting an NREL library representative.
    """

    def __init__(self, url, n_pages=1):
        """
        Parameters
        ----------
        url : str
            Research Hub API URL to request.
        n_pages : int
            Number of pages to get from the API. Typical response has 20
            entries per page. Default of 1 ensures that this class doesn't
            hang on a million responses.
        """
        api_key = os.getenv("RHUB_API_KEY")
        assert api_key is not None, "Please set RHUB_API_KEY!"

        self.url = url
        self._session = requests.Session()
        self._response = None
        self._n_pages = 0
        self._iter = 0

        records = self._get_all(n_pages)
        records = [ProfilesRecord(single) for single in records]
        # Entries whose last name is 'NREL' are dropped from the list.
        records = [prof for prof in records if prof.last_name != 'NREL']
        super().__init__(records)

    @staticmethod
    def _check_response(response):
        """Raise RuntimeError if the API response is not OK."""
        if not response.ok:
            msg = ('API Request got error {}: "{}"'
                   .format(response.status_code, response.reason))
            raise RuntimeError(msg)

    def _get_first(self):
        """Get the first page of Profiles.

        Also sets ``self._n_pages`` to the number of pages reported by the
        API, either from the 'last' link header or from the record count
        and page size in the response body.

        Returns
        -------
        first_page : list
            First page of records as a list.

        Raises
        ------
        RuntimeError
            If the API responds with an error status.
        """
        self._response = self._session.get(
            self.url, headers={'Accept': 'application/json'})
        # Check the status before decoding: an error body may not be JSON.
        self._check_response(self._response)

        resp = self._response.json()
        first_page = resp['items']

        self._n_pages = 1
        if 'last' not in self._response.links:
            size = resp['pageInformation'].get('size')
            if size:
                self._n_pages = math.ceil(resp['count'] / size)
        else:
            url = self._response.links['last']['url']
            # Tolerate further query parameters following the page number.
            match = re.search(r'[?&]page=(\d+)', url)
            if match:
                self._n_pages = int(match.group(1))

        logger.debug('Found approximately {} records.'
                     .format(self._n_pages * len(first_page)))

        return first_page

    def _get_pages(self, n_pages):
        """Get response pages up to n_pages from Research Hub.

        Parameters
        ----------
        n_pages : int
            Number of pages to retrieve

        Returns
        -------
        next_pages : generator
            Generator of the pages after the first one, each of which
            is a list of profiles.

        Raises
        ------
        RuntimeError
            If the API responds with an error status for a page.
        """
        last_page = min(n_pages, self._n_pages)
        for page in range(2, last_page + 1):
            response = self._session.get(
                self.url,
                params={'page': page},
                headers={'Accept': 'application/json'})
            self._check_response(response)
            yield response.json()['items']

    def _get_all(self, n_pages):
        """Get all pages of profiles up to n_pages.

        Parameters
        ----------
        n_pages : int
            Number of pages to retrieve

        Returns
        -------
        all_records : list
            List of all profile records.
        """
        records = list(self._get_first())
        for page in self._get_pages(n_pages):
            records.extend(page)
        return records

    @staticmethod
    def _record_label(record):
        """Return a printable label for a record without ever raising."""
        try:
            return record.title
        except Exception:  # only used while reporting another error
            return record.get('pureId')

    def download(self, out_dir):
        """Download all profiles from the records in this object into a
        directory. TXT files will be given file names based on researcher ID.

        Records that fail to download are reported and skipped so the
        remaining records are still processed.

        Parameters
        ----------
        out_dir : str
            Directory to download TXT files to. This directory will be created
            if it does not already exist.
        """
        os.makedirs(out_dir, exist_ok=True)
        for record in self:
            try:
                # Building the path is inside the try so a record without a
                # usable ID does not abort the whole loop.
                fp_out = os.path.join(out_dir, record.id + '.txt')
                if not os.path.exists(fp_out):
                    record.download(fp_out)
            except Exception as e:
                label = self._record_label(record)
                print(f"Could not download {label} with error {e}")
                logger.exception('Could not download profile ID {}: {}'
                                 .format(label, e))
        logger.info('Finished Profiles download!')
class PublicationsRecord(dict):
    """Class to handle a single publication as dictionary data.

    This class requires setting an 'RHUB_API_KEY' environment
    variable to access the Pure Web Service. The API key can be
    obtained by contacting an NREL library representative.
    """

    # Categories for which a PDF download is attempted before falling back
    # to saving the abstract as text.
    _PDF_CATEGORIES = ('Technical Report', 'Paper', 'Fact Sheet')

    def __init__(self, record):
        """
        Parameters
        ----------
        record : dict
            Research Hub record in dict form, typically a response from API.
        """
        api_key = os.getenv("RHUB_API_KEY")
        assert api_key is not None, "Please set RHUB_API_KEY!"
        assert isinstance(record, dict)
        # NOTE(review): every accessor below reads the record through
        # self.get(...), so the record is copied into this dict here.
        super().__init__(record)

    @staticmethod
    def _first_text_value(node):
        """Return ``node['text'][0]['value']``, or None if a level is missing.
        """
        if not isinstance(node, dict):
            return None
        text = node.get('text')
        if not text:
            return None
        first = text[0]
        return first.get('value') if isinstance(first, dict) else None

    @property
    def title(self):
        """Get the title of this publication.

        Returns
        -------
        title : str | None
            Publication title, None if not present.
        """
        title = self.get('title')
        return title.get('value') if isinstance(title, dict) else None

    @property
    def year(self):
        """Get the publish year.

        Returns
        -------
        year : int | None
            Year of the first publication status, None if not present.
        """
        statuses = self.get('publicationStatuses')
        if not statuses:
            return None
        date = statuses[0].get('publicationDate')
        return date.get('year') if isinstance(date, dict) else None

    @property
    def url(self):
        """Get the url associated with the publication.

        Returns
        -------
        url : str | None
            Publication URL ('portalUrl'), None if not present.
        """
        info = self.get('info')
        return info.get('portalUrl') if isinstance(info, dict) else None

    @property
    def id(self):
        """Get the 'NREL Publication Number' for
        this record.

        Returns
        -------
        id : str | None
            First free keyword of the first keyword group with '/' replaced
            by '-'. When that keyword cannot be reached, the record's
            'externalId' is returned instead (may be None).
        """
        try:
            group = self.get('keywordGroups')[0]
            cont = group.get('keywordContainers')[0]
            pub_id = cont.get('freeKeywords')[0].get('freeKeywords')[0]
            return pub_id.replace('/', '-')
        except (TypeError, IndexError, AttributeError, KeyError):
            # Missing, empty or differently shaped keyword structure.
            return self.get('externalId')

    @property
    def authors(self):
        """Get the names of all authors for a publication.

        Returns
        -------
        out : str | None
            Comma-separated author names, None if the record has no
            person associations.
        """
        pa = self.get('personAssociations')
        if not pa:
            return None

        authors = []
        for person in pa:
            name = person.get('name')
            if not name:
                continue
            full = " ".join(filter(bool, [name.get('firstName'),
                                          name.get('lastName')]))
            if full:
                authors.append(full)

        return ', '.join(authors)

    @property
    def category(self):
        """Get the publication category for this record.

        Returns
        -------
        cat : str | None
            Publication category, ex: Technical Report, Article.
            None if not present.
        """
        pub_type = self.get('type')
        if not isinstance(pub_type, dict):
            return None
        return self._first_text_value(pub_type.get('term'))

    @property
    def links(self):
        """Get the doi and pdf links for a publication.

        Returns
        -------
        doi : str | None
            doi link for publication (last one listed).
        pdf_url : str | None
            'link' entry for publication (last one listed).
        """
        doi = None
        pdf_url = None
        for version in self.get('electronicVersions') or []:
            if version.get('doi'):
                doi = version.get('doi')
            if version.get('link'):
                pdf_url = version.get('link')
        return doi, pdf_url

    @property
    def abstract(self):
        """Get the abstract text for this publication.

        Returns
        -------
        value : str | None
            String containing abstract text, None if not present.
        """
        return self._first_text_value(self.get('abstract'))

    def save_abstract(self, abstract_text, out_fp):
        """Write the abstract text, prefixed with the title, to a .txt file.

        Parameters
        ----------
        abstract_text : str
            String with abstract text.
        out_fp : str
            File path of the TXT file to write.
        """
        title = self.title
        full = f"The report titled {title} can be summarized as follows: "
        full += abstract_text
        # Explicit encoding so non-ASCII text does not depend on the
        # platform default.
        with open(out_fp, "w", encoding="utf-8") as text_file:
            text_file.write(full)

    def download(self, pdf_dir, txt_dir):
        """Download PDFs and TXT files to the directories provided. If a record
        does not fit the criteria for PDF download, a TXT file with the record
        abstract will be saved to the TXT directory.

        Existing files are never re-downloaded or rewritten.

        Parameters
        ----------
        pdf_dir : str
            Directory for pdf download.
        txt_dir : str
            Directory for txt download.
        """
        base_name = self.id.replace('/', '-')
        pdf_url = self.links[1]
        wants_pdf = self.category in self._PDF_CATEGORIES

        if wants_pdf and pdf_url and pdf_url.endswith('.pdf'):
            fp = os.path.join(pdf_dir, base_name + '.pdf')
            if not os.path.exists(fp):
                with requests.Session() as session:
                    response = session.get(pdf_url)
                    # Do not save an error page as a .pdf; it would also
                    # block later retries through the exists() check.
                    response.raise_for_status()
                with open(fp, 'wb') as f_pdf:
                    f_pdf.write(response.content)
            return

        # Not a PDF category, or no usable PDF link: fall back to the
        # abstract text.
        fp = os.path.join(txt_dir, base_name + '.txt')
        if os.path.exists(fp):
            return
        abstract = self.abstract
        if abstract:
            self.save_abstract(abstract, fp)
        else:
            logger.info(f'{self.title}: does not have an '
                        'abstract to downlod')
class PublicationsList(list):
    """Class to retrieve and handle multiple publications from an API URL.

    This class requires setting an 'RHUB_API_KEY' environment
    variable to access the Pure Web Service. The API key can be
    obtained by contacting an NREL library representative.
    """

    def __init__(self, url, n_pages=1):
        """
        Parameters
        ----------
        url : str
            Research Hub API URL to request.
        n_pages : int
            Number of pages to get from the API. Typical response has 20
            entries per page. Default of 1 ensures that this class doesn't
            hang on a million responses.
        """
        api_key = os.getenv("RHUB_API_KEY")
        assert api_key is not None, "Please set RHUB_API_KEY!"

        self.url = url
        self._session = requests.Session()
        self._response = None
        self._n_pages = 0
        self._iter = 0

        records = self._get_all(n_pages)
        records = [PublicationsRecord(single) for single in records]
        super().__init__(records)

    @staticmethod
    def _check_response(response):
        """Raise RuntimeError if the API response is not OK."""
        if not response.ok:
            msg = ('API Request got error {}: "{}"'
                   .format(response.status_code, response.reason))
            raise RuntimeError(msg)

    def _get_first(self):
        """Get the first page of publications

        Also sets ``self._n_pages`` to the number of pages reported by the
        API, either from the 'last' link header or from the record count
        and page size in the response body.

        Returns
        -------
        first_page : list
            Publication records as list.

        Raises
        ------
        RuntimeError
            If the API responds with an error status.
        """
        self._response = self._session.get(
            self.url, headers={'Accept': 'application/json'})
        # Check the status before decoding: an error body may not be JSON.
        self._check_response(self._response)

        resp = self._response.json()
        first_page = resp['items']

        self._n_pages = 1
        if 'last' not in self._response.links:
            size = resp['pageInformation'].get('size')
            if size:
                self._n_pages = math.ceil(resp['count'] / size)
        else:
            url = self._response.links['last']['url']
            # Tolerate further query parameters following the page number.
            match = re.search(r'[?&]page=(\d+)', url)
            if match:
                self._n_pages = int(match.group(1))

        logger.debug('Found approximately {} records.'
                     .format(self._n_pages * len(first_page)))

        return first_page

    def _get_pages(self, n_pages):
        """Get response pages up to n_pages from Research Hub.

        Parameters
        ----------
        n_pages : int
            Number of pages to retrieve

        Returns
        -------
        next_pages : generator
            Generator of the pages after the first one, each of which
            is a list of records.

        Raises
        ------
        RuntimeError
            If the API responds with an error status for a page.
        """
        last_page = min(n_pages, self._n_pages)
        for page in range(2, last_page + 1):
            response = self._session.get(
                self.url,
                params={'page': page},
                headers={'Accept': 'application/json'})
            self._check_response(response)
            yield response.json()['items']

    def _get_all(self, n_pages):
        """Get all pages of publications up to n_pages.

        Parameters
        ----------
        n_pages : int
            Number of pages to retrieve

        Returns
        -------
        all_records : list
            List of all publication records.
        """
        records = list(self._get_first())
        for page in self._get_pages(n_pages):
            records.extend(page)
        return records

    @staticmethod
    def _record_label(record):
        """Return a printable label for a record without ever raising."""
        try:
            return record.title
        except Exception:  # only used while reporting another error
            return record.get('externalId')

    def download(self, pdf_dir, txt_dir):
        """Download all PDFs and abstract TXTs from the records in this
        object into a directory. Files will be given file names based
        on their record ID.

        Records that fail to download are logged and skipped so the
        remaining records are still processed.

        Parameters
        ----------
        pdf_dir : str
            Directory to download PDFs to. This directory will be created
            if it does not already exist.
        txt_dir : str
            Directory to download TXTs to. This directory will be created
            if it does not already exist.
        """
        os.makedirs(pdf_dir, exist_ok=True)
        os.makedirs(txt_dir, exist_ok=True)
        for record in self:
            try:
                record.download(pdf_dir, txt_dir)
            except Exception as e:
                # Use a label lookup that cannot raise, so one malformed
                # record does not abort the remaining downloads.
                logger.exception('Could not download {}: {}'
                                 .format(self._record_label(record), e))
        logger.info('Finished publications download!')