# Source code for gdoc_down.core

"""
Save the content of a Google document to a local file.

:Author: Jonathan Karr <karr@mssm.edu>
:Date: 2016-08-16
:Copyright: 2016, Karr Lab
:License: MIT
"""

from bs4 import BeautifulSoup
from oauth2client import tools as oauth2client_tools
from xml.etree import ElementTree
import apiclient
import argparse
import json
import re
import oauth2client
import os


class GDocDown(object):
    r""" Downloads Google documents to several formats

    - HTML (.html)
    - LaTeX (.tex)
    - Open Office document (.odt)
    - Plain text file (.txt)
    - Portable document format (.pdf)
    - Rich text document (.rtf)
    - Word document (.docx)

    The class has several special features for handling LaTeX files:

    - The program ignores all images. This allows the user to place images inside the Google
      document for convenience and to use \includegraphics to embed images in compiled PDF files.
    - The program will convert all Google document comments to PDF comments.
    - The program ignores all page breaks.

    The first time the program is called, the program will request access to the user's Google
    account. The client secret is read from :obj:`CLIENT_SECRET_PATH` and the resulting
    credentials are stored at :obj:`CREDENTIAL_PATH`.

    Attributes:
        credentials (:obj:`oauth2client.client.OAuth2Credentials`): Credentials object for OAuth 2.0.
        service (:obj:`apiclient.discovery.Resource`): A Resource object with methods for interacting with the service
    """

    APPLICATION_NAME = 'gdoc_down'

    # client secret shipped next to this module
    CLIENT_SECRET_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'client.json')

    # per-user location where the OAuth credentials are cached
    CREDENTIAL_PATH = os.path.join(os.path.expanduser('~'), '.gdoc_down', 'auth.json')

    SCOPES = (
        'https://www.googleapis.com/auth/drive',
        'https://www.googleapis.com/auth/drive.file',
        'https://www.googleapis.com/auth/drive.metadata.readonly',
        'https://www.googleapis.com/auth/drive.readonly',
    )

    # MIME type requested from the export call for each supported output format.
    # LaTeX is exported as HTML and converted locally by `convert_html_to_latex`.
    EXPORT_TYPES = {
        'docx': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
        'html': 'text/html',
        'odt': 'application/vnd.oasis.opendocument.text',
        'pdf': 'application/pdf',
        'rtf': 'application/rtf',
        'tex': 'text/html',
        'txt': 'text/plain',
    }

    # UTF-8 byte-order mark that may prefix plain-text exports
    UTF8_BOM = b'\xef\xbb\xbf'

    def __init__(self, credentials=None, service=None):
        """
        Args:
            credentials (:obj:`oauth2client.client.OAuth2Credentials`, optional): Credentials object for OAuth 2.0;
                obtained with :obj:`get_credentials` when omitted.
            service (:obj:`apiclient.discovery.Resource`, optional): A Resource object with methods for interacting
                with the service; built with :obj:`authenticate` when omitted.
        """
        if credentials is None:
            credentials = self.get_credentials()

        if service is None:
            service = self.authenticate(credentials)

        self.credentials = credentials
        self.service = service

    def get_credentials(self):
        """ Get and save user credentials from Google. If credentials haven't already been
        stored, or if the stored credentials are invalid, obtain new credentials.

        Returns:
            :obj:`oauth2client.client.OAuth2Credentials`: Credentials object for OAuth 2.0.
        """
        # A bare `import oauth2client` does not guarantee that these submodules
        # are loaded, so import them explicitly before using them.
        import oauth2client.client
        import oauth2client.file

        store = oauth2client.file.Storage(GDocDown.CREDENTIAL_PATH)
        credentials = store.get()
        if not credentials or credentials.invalid:
            # make sure the directory that will hold the credential file exists
            credential_dir = os.path.dirname(GDocDown.CREDENTIAL_PATH)
            if not os.path.isdir(credential_dir):
                os.makedirs(credential_dir)

            flow = oauth2client.client.flow_from_clientsecrets(GDocDown.CLIENT_SECRET_PATH, GDocDown.SCOPES)
            flow.user_agent = GDocDown.APPLICATION_NAME
            parser = argparse.ArgumentParser(
                description=__doc__,
                formatter_class=argparse.RawDescriptionHelpFormatter,
                parents=[oauth2client_tools.argparser])
            # parse an empty argument list so the flow runs with its default flags
            flags = parser.parse_args([])
            credentials = oauth2client_tools.run_flow(flow, store, flags)

        return credentials

    def authenticate(self, credentials):
        """ Authenticate with Google server

        Args:
            credentials (:obj:`oauth2client.client.OAuth2Credentials`): Credentials object for OAuth 2.0.

        Returns:
            :obj:`apiclient.discovery.Resource`: A Resource object with methods for interacting with the service
        """
        return apiclient.discovery.build('drive', 'v3', credentials=credentials)

    def download(self, gdoc_file, format='docx', out_path='.', extension=None):
        """ Download a Google document and save it to a local file

        Args:
            gdoc_file (:obj:`str`): path to Google document
            format (:obj:`str`, optional): desired output format (docx, html, odt, pdf, rtf, tex, txt)
            out_path (:obj:`str`, optional): path to save document; if this is an existing directory,
                the file name is derived from `gdoc_file` and `extension`
            extension (:obj:`str`, optional): extension of the saved document (defaults to `format`);
                only allowed when `out_path` is a directory

        Raises:
            :obj:`ValueError`: if the format is unknown or if an output file path and an extension
                are both specified
        """
        export_type = GDocDown.EXPORT_TYPES.get(format)
        if export_type is None:
            raise ValueError('Unknown format "{}"'.format(format))

        if os.path.isdir(out_path):
            if extension is None:
                extension = format
            root, _ = os.path.splitext(os.path.basename(gdoc_file))
            out_file = os.path.join(out_path, root + "." + extension)
        elif extension is None:
            out_file = out_path
        else:
            raise ValueError('Ouput file path and extension cannot both be specified')

        # get google document id
        gdoc_id = self.get_gdoc_id(gdoc_file)

        # download file from Google
        content = self.service.files().export(fileId=gdoc_id, mimeType=export_type).execute()

        # convert content as requested
        if format == 'txt':
            content = GDocDown._strip_utf8_bom(content)
        elif format == 'tex':
            content = self.convert_html_to_latex(content)

        # save content to local file
        with open(out_file, "wb") as out_stream:
            out_stream.write(content)

    @staticmethod
    def _strip_utf8_bom(content):
        """ Remove a leading UTF-8 byte-order mark, if there is one

        Args:
            content (:obj:`bytes`): downloaded content

        Returns:
            :obj:`bytes`: content without a leading byte-order mark
        """
        if content.startswith(GDocDown.UTF8_BOM):
            return content[len(GDocDown.UTF8_BOM):]
        return content

    @staticmethod
    def get_gdoc_id(gdoc_file):
        """ Get Google document id

        Args:
            gdoc_file (:obj:`str`): path to Google document (a JSON file with a `doc_id` key)

        Returns:
            :obj:`str`: id of Google document
        """
        with open(gdoc_file) as data_file:
            data = json.load(data_file)
        return data['doc_id']

    @staticmethod
    def convert_html_to_latex(html_content):
        """ Format Google document content downloaded in HTML format for LaTeX

        * Replace HTML character entities with the characters they represent
        * Remove images
        * Replace comments with PDF comments (using `pdfcomment` package)

        Args:
            html_content (:obj:`bytes`): HTML version of Google document

        Returns:
            :obj:`bytes`: formatted LaTeX
        """
        # decode content
        html_content = html_content.decode('utf-8')

        # --- remove unnecessary content ---
        # head (DOTALL so that a head spanning several lines is removed too)
        html_content = re.sub(r'<head>.*?</head>', '', html_content, flags=re.DOTALL)

        # page breaks: must be removed before the style attributes are stripped,
        # because the marker is identified by its style attribute
        html_content = html_content.replace('<hr style="page-break-before:always;display:none;">', '')

        # style
        html_content = re.sub(r' style=".*?"', '', html_content)

        # break lines
        html_content = html_content.replace("<br>", "\n")

        # images
        html_content = re.sub(r'<img.*?>', '', html_content)

        # --- substitute character entities ---
        html_content = html_content.replace('&nbsp;', ' ')
        html_content = str(BeautifulSoup(html_content, 'html.parser'))

        # --- replace comments with PDF comments (using `pdfcomment` package) ---
        # parse html content
        root = ElementTree.fromstring(html_content)

        # comments are numbered consecutively from 1; stop at the first missing id
        comment_id = 1
        while True:
            comment_path = ".//a[@id='cmnt%d']" % comment_id
            comment = root.find(comment_path)
            if comment is None:
                break
            comment_parent = root.find(comment_path + '/..')
            comment_grandparent = root.find(comment_path + '/../..')
            comment_greatgrandparent = root.find(comment_path + '/../../..')

            # remove numbering from comment
            comment_parent.remove(comment)

            # replace superscript with PDF comment
            ref_path = ".//a[@id='cmnt_ref%d']" % comment_id
            ref = root.find(ref_path)
            ref_parent = root.find(ref_path + '/..')
            if ref is not None and ref_parent is not None:
                ref_parent.remove(ref)
                ref_parent.text = '\\pdfcomment{%s}' % GDocDown.get_element_text(comment_grandparent)

            # remove comment footnote
            comment_greatgrandparent.remove(comment_grandparent)

            comment_id += 1

        # collect body text, one blank-line-terminated paragraph per child of the body
        body = root.find('./body')
        tex_content = ''.join(GDocDown.get_element_text(child) + "\n\n" for child in body)

        # return formatted LaTeX
        return tex_content.encode('utf-8')

    @staticmethod
    def get_element_text(element):
        """ Get all of the text underneath an XML element

        This includes the element's own text, the text of all of its descendants, and the
        text that follows each descendant (its tail), in document order. The tail of
        `element` itself is not included.

        Args:
            element (:obj:`xml.etree.ElementTree.Element`): XML element

        Returns:
            :obj:`str`: element's text
        """
        return ''.join(element.itertext())