# Source code for gdoc_down.core

""" Save the content of a Google document to a local file.

:Author: Jonathan Karr
:Date: 2016-08-16
:Copyright: 2016, Karr Lab
:License: MIT
"""

from bs4 import BeautifulSoup
from oauth2client import tools as oauth2client_tools
from xml.etree import ElementTree
import apiclient
import argparse
import json
import re
import oauth2client
import os

class GDocDown(object):
    r""" Downloads Google documents to several formats

    - HTML (.html)
    - LaTeX (.tex)
    - Open Office document (.odt)
    - Plain text file (.txt)
    - Portable document format (.pdf)
    - Rich text document (.rtf)
    - Word document (.docx)

    The class has several special features for handling LaTeX files:

    - The program ignores all images. This allows the user to place images inside the Google
      document for convenience and to use \includegraphics to embed images in compiled PDF files.
    - The program will convert all Google document comments to PDF comments.
    - The program ignores all page breaks.

    The first time the program is called, the program will request access to the user's Google
    account. The client secret is read from :obj:`CLIENT_SECRET_PATH` and the resulting
    credentials are stored at :obj:`CREDENTIAL_PATH`.

    Attributes:
        credentials (:obj:`oauth2client.client.OAuth2Credentials`): Credentials object for OAuth 2.0.
        service (:obj:`apiclient.discovery.Resource`): A Resource object with methods for
            interacting with the service
    """

    APPLICATION_NAME = 'gdoc_down'
    CLIENT_SECRET_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'client.json')
    CREDENTIAL_PATH = os.path.join(os.path.expanduser('~'), '.gdoc_down', 'auth.json')

    # NOTE(review): every scope string is empty in this copy of the file. Presumably the
    # Google OAuth scope URLs were stripped at some point -- restore them before use.
    SCOPES = (
        '',
        '',
        '',
        '',
    )

    # MIME type requested from the export endpoint for each supported output format.
    # LaTeX has no native export: the document is fetched as HTML and converted locally.
    EXPORT_TYPES = {
        'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'html': 'text/html',
        'odt': 'application/vnd.oasis.opendocument.text',
        'pdf': 'application/pdf',
        'rtf': 'application/rtf',
        'tex': 'text/html',
        'txt': 'text/plain',
    }

    # UTF-8 byte order mark that is stripped from the front of plain text exports
    UTF8_BOM = b'\xef\xbb\xbf'

    def __init__(self, credentials=None, service=None):
        """
        Args:
            credentials (:obj:`oauth2client.client.OAuth2Credentials`, optional): Credentials
                object for OAuth 2.0. Obtained with :obj:`get_credentials` when omitted.
            service (:obj:`apiclient.discovery.Resource`, optional): A Resource object with
                methods for interacting with the service. Built with :obj:`authenticate`
                when omitted.
        """
        if credentials is None:
            credentials = self.get_credentials()
        if service is None:
            service = self.authenticate(credentials)

        self.credentials = credentials
        self.service = service

    def get_credentials(self):
        """ Get and save user credentials from Google.

        If credentials haven't already been stored, or if the stored credentials are invalid,
        obtain new credentials through the OAuth 2.0 flow.

        Returns:
            :obj:`oauth2client.client.OAuth2Credentials`: Credentials object for OAuth 2.0.
        """
        # Import the submodules explicitly: `import oauth2client` alone does not guarantee
        # that `oauth2client.file` and `oauth2client.client` have been loaded.
        import oauth2client.client
        import oauth2client.file

        store = oauth2client.file.Storage(GDocDown.CREDENTIAL_PATH)
        credentials = store.get()

        if not credentials or credentials.invalid:
            # Make sure the directory that will hold the credential file exists before the
            # flow tries to save the new credentials into it.
            credential_dir = os.path.dirname(GDocDown.CREDENTIAL_PATH)
            if not os.path.isdir(credential_dir):
                os.makedirs(credential_dir)

            flow = oauth2client.client.flow_from_clientsecrets(
                GDocDown.CLIENT_SECRET_PATH, GDocDown.SCOPES)
            flow.user_agent = GDocDown.APPLICATION_NAME

            # Parse an empty argument list so the flow gets the default flag values
            # regardless of the arguments of the calling program.
            parser = argparse.ArgumentParser(
                description=__doc__,
                formatter_class=argparse.RawDescriptionHelpFormatter,
                parents=[oauth2client_tools.argparser])
            flags = parser.parse_args([])

            credentials = oauth2client_tools.run_flow(flow, store, flags)

        return credentials

    def authenticate(self, credentials):
        """ Authenticate with Google server

        Args:
            credentials (:obj:`oauth2client.client.OAuth2Credentials`): Credentials object
                for OAuth 2.0.

        Returns:
            :obj:`apiclient.discovery.Resource`: A Resource object with methods for
            interacting with the service
        """
        return apiclient.discovery.build('drive', 'v3', credentials=credentials)

    def download(self, gdoc_file, format='docx', out_path='.', extension=None):
        """ Download a Google document and save it to a local file

        Args:
            gdoc_file (:obj:`str`): path to Google document
            format (:obj:`str`, optional): desired output format (docx, html, odt, pdf, rtf, tex, txt)
            out_path (:obj:`str`, optional): path to save document; either an existing
                directory or the path of the output file
            extension (:obj:`str`, optional): extension of the saved document; only allowed
                when `out_path` is a directory

        Raises:
            :obj:`ValueError`: if format unknown or if output file path and extension are
                both specified. :obj:`ValueError` is a subclass of :obj:`Exception`, so
                callers that catch :obj:`Exception` keep working.
        """
        export_type = GDocDown.EXPORT_TYPES.get(format)
        if export_type is None:
            raise ValueError('Unknown format "{}"'.format(format))

        if os.path.isdir(out_path):
            # name the output after the .gdoc file and place it inside the directory
            if extension is None:
                extension = format
            root, _ = os.path.splitext(os.path.basename(gdoc_file))
            out_file = os.path.join(out_path, root + "." + extension)
        elif extension is None:
            out_file = out_path
        else:
            raise ValueError('Ouput file path and extension cannot both be specified')

        # get google document id
        gdoc_id = self.get_gdoc_id(gdoc_file)

        # download file from Google
        content = self.service.files().export(fileId=gdoc_id, mimeType=export_type).execute()

        # convert content as requested
        if format == 'txt':
            content = self._strip_utf8_bom(content)
        elif format == 'tex':
            content = self.convert_html_to_latex(content)

        # save content to local file
        with open(out_file, "wb") as out_fh:
            out_fh.write(content)

    @staticmethod
    def _strip_utf8_bom(content):
        """ Remove a leading UTF-8 byte order mark, leaving content without one untouched """
        if content.startswith(GDocDown.UTF8_BOM):
            return content[len(GDocDown.UTF8_BOM):]
        return content

    @staticmethod
    def get_gdoc_id(gdoc_file):
        """ Get Google document id

        Args:
            gdoc_file (:obj:`str`): path to Google document (a JSON file with a `doc_id` key)

        Returns:
            :obj:`str`: id of Google document
        """
        with open(gdoc_file) as data_file:
            data = json.load(data_file)
        return data['doc_id']

    @staticmethod
    def convert_html_to_latex(html_content):
        """ Format Google document content downloaded in HTML format for LaTeX

        * Decode character entities
        * Remove the head, style attributes, page breaks and images
        * Replace comments with PDF comments (using `pdfcomment` package)

        Args:
            html_content (:obj:`bytes`): HTML version of Google document

        Returns:
            :obj:`bytes`: formatted LaTeX
        """
        # decode content
        html_content = html_content.decode('utf-8')

        # --- remove unnecessary content ---
        # head; DOTALL so that a head spanning several lines is removed as well
        html_content = re.sub(r'<head>.*</head>', '', html_content, flags=re.DOTALL)

        # page breaks; this has to run before the style attributes are stripped, otherwise
        # the literal below can no longer match
        html_content = html_content.replace('<hr style="page-break-before:always;display:none;">', '')

        # style
        html_content = re.sub(r' style=".*?"', '', html_content)

        # line breaks
        html_content = html_content.replace("<br>", "\n")

        # images
        html_content = re.sub(r'<img.*?>', '', html_content)

        # --- substitute character entities ---
        html_content = html_content.replace('&nbsp;', ' ')
        # round trip through BeautifulSoup so the markup can be parsed as XML below
        html_content = str(BeautifulSoup(html_content, 'html.parser'))

        # --- replace comments with PDF comments (using `pdfcomment` package) ---
        root = ElementTree.fromstring(html_content)

        # comments are numbered consecutively from 1; stop at the first missing number
        comment_id = 0
        while True:
            comment_id += 1
            comment = root.find(".//a[@id='cmnt%d']" % comment_id)
            if comment is None:
                break

            # look up the ancestors before the tree is modified
            comment_parent = root.find(".//a[@id='cmnt%d']/.." % comment_id)
            comment_grandparent = root.find(".//a[@id='cmnt%d']/../.." % comment_id)
            comment_greatgrandparent = root.find(".//a[@id='cmnt%d']/../../.." % comment_id)

            # remove numbering from comment
            comment_parent.remove(comment)

            # replace superscript with PDF comment
            ref = root.find(".//a[@id='cmnt_ref%d']" % comment_id)
            ref_parent = root.find(".//a[@id='cmnt_ref%d']/.." % comment_id)
            ref_parent.remove(ref)
            ref_parent.text = r'\pdfcomment{%s}' % GDocDown.get_element_text(comment_grandparent)

            # remove comment footnote
            comment_greatgrandparent.remove(comment_grandparent)

        # collect body text, one block per top-level element of the body
        body = root.find('./body')
        tex_content = ''.join(GDocDown.get_element_text(child) + "\n\n" for child in body)

        # return formatted LaTeX
        return tex_content.encode('utf-8')

    @staticmethod
    def get_element_text(element):
        """ Get all of the text underneath an XML element

        The text of the element and of all of its descendants is concatenated in document
        order, including the text that follows a nested element (its tail). The tail of
        `element` itself is not included.

        Args:
            element (:obj:`xml.etree.ElementTree.Element`): XML element

        Returns:
            :obj:`str`: element's text
        """
        return ''.join(element.itertext())