Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Create updating_docs_md.py #30

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
195 changes: 195 additions & 0 deletions updating_metadata_docs/updating_docs_md.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,195 @@
import gspread
import pandas as pd
from oauth2client.service_account import ServiceAccountCredentials
from googleapiclient.discovery import build

#Before starting you need to set credentials in the anaconda prompt
#set GOOGLE_APPLICATION_CREDENTIALS=

# Part of this code is Licensed under the Apache License, Version 2.0
# Copy of License at http://www.apache.org/licenses/LICENSE-2.0

#filepath for your credentials
cred_file = ""

#define the scopes you may need to use, discovery info
scope = ["https://www.googleapis.com/auth/spreadsheets",
"https://www.googleapis.com/auth/drive.file",
"https://www.googleapis.com/auth/documents",
"https://www.googleapis.com/auth/drive.metadata.readonly"]

#build helps call the api that you're going to use - in this case docs and drive
drive_service = build('drive', 'v3')
doc_service = build('docs', 'v1')
sheets_service = build('sheets', 'v4')

#used to authenticate Google Spreadsheets
credentials = ServiceAccountCredentials.from_json_keyfile_name(cred_file, scope)
gc = gspread.authorize(credentials)

#Google Sheet you will be using to store metadata
SPREADSHEET_ID = "" #add id
workbook = gc.open_by_key(SPREADSHEET_ID)
sheet = workbook.worksheet("Master")

#fields from the RW metadata template
#this is used in creating a dataframe as well (see: metadata_df)
#if any of these fields change, you must change metadata_df as well
doc_fields = ["Source Organizations", "Function", "Overview", "Methodology",
"Cautions", "Additional Information", "Visualizing the Data",
"Citation", "License", "Disclaimer", "Direct Download link",
"Download from Source link", "Learn More link", "Formal Name",
"Date of Content", "Spatial Resolution", "Frequency of Updates",
"Geographic Coverage", "Data Format",
"Subtitle \n(example: UNEP-WCMC/IUCN)", "Published Language",
"Data shown on Resource Watch Map"]

"""
Extract text from Google Doc and save it as a dictionary.
"""
def read_paragraph_element(element):
"""Returns the text in the given ParagraphElement.
"""
text_run = element.get('textRun')
if not text_run:
return ''
return text_run.get('content')
def read_strucutural_elements(elements):
"""Recurses through a list of Structural Elements to read a document's text where text may be
in nested elements.

Args:
elements: a list of Structural Elements.
"""
text = ''
for value in elements:
if 'paragraph' in value:
elements = value.get('paragraph').get('elements')
for elem in elements:
text += read_paragraph_element(elem)
elif 'table' in value:
# The text in table cells are in nested Structural Elements and tables may be
# nested.
table = value.get('table')
for row in table.get('tableRows'):
cells = row.get('tableCells')
for cell in cells:
text += read_strucutural_elements(cell.get('content'))
elif 'tableOfContents' in value:
# The text in the TOC is also in a Structural Element.
toc = value.get('tableOfContents')
text += read_strucutural_elements(toc.get('content'))
return text

'''
Creates dataframe that contains metadata from the Google Docs
'''
def metadata_dataframe():

doc_files = drive_service.files().list(q="mimeType='application/vnd.google-apps.document'", spaces='drive',
fields='files(id, name)').execute()
doc_info = doc_files['files']

metadata_dict = []
for x in range(len(doc_info)):
#get the document id and name
DOCUMENT_ID = doc_info[x]['id']
DOCUMENT_NAME = doc_info[x]['name']

#gets the text in a documents
doc = doc_service.documents().get(documentId=DOCUMENT_ID).execute()
doc_content = doc.get('body').get('content')
doc_text = read_strucutural_elements(doc_content)

#checks if metadata fields are repeated in the text
#if fields appear more than once
#consider revising the document on Google Docs
for metadata_field in doc_fields:
if metadata_field not in doc_text:
print("The %s is not in the %s text" % (metadata_field, DOCUMENT_NAME))
exit()
elif doc_text.count(metadata_field) != 1:
print("The %s repeats more than once in %s" % (metadata_field, DOCUMENT_NAME))
exit()

print("%s ready for ingestion." % DOCUMENT_NAME)

#stores text in dictionary based on metadata fields
doc_dict = {}
n = 1
for field in range(len(doc_fields)-1):
space_index = DOCUMENT_NAME.index(" ")
doc_dict['Public Title'] = DOCUMENT_NAME[space_index+1:]
doc_dict['WRI_ID'] = DOCUMENT_NAME[:space_index]
metadata_text = doc_text.partition(doc_fields[n-1])[2]
field_text = metadata_text.partition(doc_fields[n])[0].strip()
#remaining_text = metadata_text.partition(doc_fields[n])[2]
doc_dict[doc_fields[n-1]] = field_text
n += 1

metadata_dict += [doc_dict]
print("%s metadata has been saved." % DOCUMENT_NAME)
#creates a pandas datafrmae
df = pd.DataFrame(metadata_dict)
df['Processed Data Link (S3)'] = ""
#reorders columns so that WRI_ID and Public Title appear first
df = df[[
'WRI_ID',
'Public Title',
'Source Organizations',
'Function',
'Overview',
'Methodology',
'Cautions',
'Additional Information',
'Visualizing the Data',
'Citation',
'License',
'Disclaimer',
'Direct Download link',
'Processed Data Link (S3)',
'Download from Source link',
'Learn More link',
'Formal Name',
'Date of Content',
'Spatial Resolution',
'Frequency of Updates',
'Geographic Coverage',
'Data Format',

'Subtitle \n(example: UNEP-WCMC/IUCN)',
'Published Language'
]]

return df

'''
Takes dataframe and adds it to a Google spreadsheet
'''
def iter_pd(df):
for val in df.columns:
yield val
for row in df.to_numpy():
for val in row:
if pd.isna(val):
yield ""
else:
yield val
def pandas_to_sheets(pandas_df, sheet, clear = True):
# Updates all values in a workbook to match a pandas dataframe
if clear:
sheet.clear()
(row, col) = pandas_df.shape
cells = sheet.range("A1:{}".format(gspread.utils.rowcol_to_a1(row + 1, col)))
for cell, val in zip(cells, iter_pd(pandas_df)):
cell.value = val
sheet.update_cells(cells)
print("Google Sheet has been updated.")


#Once the docs are in the appropriate folder,
#along with the spreadsheet, run this
pandas_to_sheets(metadata_dataframe(), sheet)