# Source code for abydos.util._data

# Copyright 2019-2020 by Christopher C. Little.
# This file is part of Abydos.
#
# Abydos is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# Abydos is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with Abydos. If not, see <http://www.gnu.org/licenses/>.

"""abydos.util._data.

The util._data module manages datasets from
https://github.com/chrislit/abydos-data, including downloading them,
decompressing them, and locating them once installed.

Much of this is copied from NLTK's similar facility in
http://www.nltk.org/_modules/nltk/data.html, because they seem to have the
issues figured out, because I don't want to expend the effort to re-invent a
solution, and because their license (Apache) allows for it.
"""

import os
import re
import sys

try:
    import urllib.request as urllib
except ImportError:  # pragma: no cover
    import urllib
import zipfile

from xml.etree import ElementTree  # noqa: S405

# Public names of this module; this list controls what a wildcard import
# (``from ... import *``) exposes.
__all__ = [
    'data_path',
    'download_package',
    'list_available_packages',
    'list_installed_packages',
    'package_path',
]


# Sub-directories of every data root that are searched for installed packages.
DATA_SUBDIRS = ['corpora']

# Package index that list_available_packages reads when no URL is supplied.
INDEX_URL = (
    'https://raw.githubusercontent.com/'
    'chrislit/abydos-data/master/index.xml'
)

data_path = []
"""A list of directories where the Abydos data package might reside.
   These directories are checked in order when looking for a resource
   in the data package, so earlier entries take precedence.  This lets
   users substitute their own versions of resources, if they have them
   (e.g., in their home directory under ~/abydos_data)."""

# User-specified locations: ABYDOS_DATA holds an os.pathsep-separated list of
# directories; empty entries are dropped.
_paths_from_env = os.environ.get('ABYDOS_DATA', '').split(
    os.pathsep
)  # pragma: no cover
data_path.extend(entry for entry in _paths_from_env if entry)

# The user's home directory, unless running under App Engine or the home
# directory cannot be resolved (expanduser then returns its input unchanged).
if not (
    'APPENGINE_RUNTIME' in os.environ or os.path.expanduser('~/') == '~/'
):
    data_path.append(os.path.expanduser('~/abydos_data'))

# Locations below the Python installation prefix, shared by all platforms:
data_path.extend(
    os.path.join(sys.prefix, *parts)
    for parts in (
        ('abydos_data',),
        ('share', 'abydos_data'),
        ('lib', 'abydos_data'),
    )
)

if sys.platform.startswith('win'):  # pragma: no cover
    # Common locations on Windows:
    data_path.extend(
        [
            os.path.join(os.environ.get('APPDATA', 'C:\\'), 'abydos_data'),
            r'C:\abydos_data',
            r'D:\abydos_data',
            r'E:\abydos_data',
        ]
    )
else:
    # Common locations on UNIX & OS X:
    data_path.extend(
        [
            '/usr/share/abydos_data',
            '/usr/local/share/abydos_data',
            '/usr/lib/abydos_data',
            '/usr/local/lib/abydos_data',
        ]
    )


def package_path(resource_name):
    """Return the directory in which an installed data package resides.

    Each root in ``data_path`` is searched in order and, within it, each of
    the ``DATA_SUBDIRS``; the first existing directory named
    ``resource_name`` is returned.

    Parameters
    ----------
    resource_name : str
        Directory name of the data package to locate

    Returns
    -------
    str
        Path of the package directory

    Raises
    ------
    FileNotFoundError
        If no matching directory exists under any data root

    """
    candidates = (
        os.path.join(root, subdir, resource_name)
        for root in data_path
        for subdir in DATA_SUBDIRS
    )
    # The generator is lazy, so the search stops at the first hit.
    located = next(
        (candidate for candidate in candidates if os.path.isdir(candidate)),
        None,
    )
    if located is None:
        raise FileNotFoundError(
            'Data package not found. You may need to install or re-install it.'
        )
    return located
def list_installed_packages(path=None):
    """List all installed data packages.

    A package counts as installed when one of the ``DATA_SUBDIRS`` of a data
    root contains both a directory ``<id>`` and a metadata file ``<id>.xml``
    from which a ``name="..."`` and a numeric ``version="..."`` attribute can
    be read. Directories without usable metadata are skipped rather than
    aborting the whole listing, so one stray or half-installed directory does
    not prevent the remaining packages from being reported.

    Parameters
    ----------
    path : str
        A single data root to inspect; if omitted (or empty), every directory
        in ``data_path`` is inspected

    Returns
    -------
    list
        One ``(id, name, version)`` tuple per installed package, where ``id``
        is the directory name and ``version`` is a float

    """
    roots = [path] if path else data_path

    packages = []
    for root in roots:
        for subdir in DATA_SUBDIRS:
            check_path = os.path.join(root, subdir)
            if not os.path.isdir(check_path):
                continue
            for package in os.listdir(check_path):
                if not os.path.isdir(os.path.join(check_path, package)):
                    continue

                # A directory without its metadata file is not a usable
                # package (e.g. leftovers from an archive); ignore it.
                xml_path = os.path.join(check_path, package + '.xml')
                if not os.path.isfile(xml_path):
                    continue
                with open(xml_path) as xml:
                    metadata = xml.read()

                name = re.search(r'name="([^"]+)"', metadata)
                version = re.search(r'version="([^"]+)"', metadata)
                if name is None or version is None:
                    continue
                try:
                    version_number = float(version.group(1))
                except ValueError:
                    # Unparseable version: treat the package as not installed.
                    continue
                packages.append((package, name.group(1), version_number))
    return packages
def list_available_packages(url=None):
    """List all data packages available for install.

    Parameters
    ----------
    url : str
        Location of the XML package index; defaults to ``INDEX_URL``

    Returns
    -------
    tuple
        A pair ``(packages, collections)``. ``packages`` is a list of
        ``(id, name, version, url, subdir, status)`` tuples, where ``version``
        is a float and ``status`` is ``'not-installed'``, ``'up-to-date'`` or
        ``'update available'`` depending on the installed version.
        ``collections`` is a list of ``(id, name, refs)`` tuples, where
        ``refs`` lists the ``ref`` attributes of the collection's items.

    Raises
    ------
    ValueError
        If ``url`` does not begin with ``https://``

    """
    installed = {entry[0]: entry[2] for entry in list_installed_packages()}

    if url is None:
        url = INDEX_URL
    if url[:8] != 'https://':
        raise ValueError('url should begin with "https://"')

    with urllib.urlopen(url) as response:  # noqa: S310
        index = ElementTree.fromstring(response.read())  # noqa: S314

    packages = []
    for node in index.findall('packages/package'):
        attrs = node.attrib
        pkg_id = attrs['id']
        pkg_name = attrs['name']
        available = float(attrs['version'])
        pkg_url = attrs['url']
        pkg_subdir = attrs['subdir']

        # Compare against the locally installed version, if there is one.
        if pkg_id not in installed:
            status = 'not-installed'
        elif installed[pkg_id] >= available:
            status = 'up-to-date'
        else:
            status = 'update available'

        packages.append(
            (pkg_id, pkg_name, available, pkg_url, pkg_subdir, status)
        )

    collections = []
    for node in index.findall('collections/collection'):
        coll_id = node.attrib['id']
        coll_name = node.attrib['name']
        members = [item.attrib['ref'] for item in node.findall('item')]
        collections.append((coll_id, coll_name, members))

    return packages, collections
def _default_download_dir():
    """Return the directory to which packages will be downloaded by default.

    This is mostly copied from NLTK's
    nltk.downloader.Downloader.default_download_dir

    Returns
    -------
    str or None
        ``None`` when ``APPENGINE_RUNTIME`` is set in the environment;
        otherwise the first existing, writable entry of ``data_path`` or,
        failing that, an ``abydos_data`` directory under ``%APPDATA%``
        (Windows) or the user's home directory

    Raises
    ------
    ValueError
        If the user's home directory cannot be resolved

    """
    # On Google App Engine the filesystem is not writable.
    if 'APPENGINE_RUNTIME' in os.environ:  # pragma: no cover
        return None

    # Prefer the first known data directory that we are allowed to write to.
    writable = next(
        (
            candidate
            for candidate in data_path
            if os.path.exists(candidate) and os.access(candidate, os.W_OK)
        ),
        None,
    )
    if writable is not None:  # pragma: no cover
        return writable

    use_appdata = sys.platform == 'win32' and 'APPDATA' in os.environ
    if use_appdata:  # pragma: no cover
        # On Windows, use %APPDATA%
        base_dir = os.environ['APPDATA']
    else:  # pragma: no cover
        # Otherwise, install in the user's home directory; expanduser
        # returns its argument unchanged when it cannot resolve it.
        base_dir = os.path.expanduser('~/')
        if base_dir == '~/':
            raise ValueError('Could not find a default download directory')

    return os.path.join(base_dir, 'abydos_data')  # pragma: no cover
def download_package(
    resource_name, url=None, data_path=None, force=False, silent=False
):
    """Download and install a package or collection.

    If ``resource_name`` is the ID of a collection, every package the
    collection refers to is installed with the same options. Otherwise the
    package with that ID is downloaded (its ``.xml`` metadata file and its
    ``.zip`` archive), the archive is extracted into the package's
    sub-directory of ``data_path``, and the archive is deleted. A
    ``resource_name`` that matches neither a collection nor a package is
    ignored.

    Parameters
    ----------
    resource_name : str
        ID of the package or collection to install
    url : str
        Location of the package index, passed to ``list_available_packages``
    data_path : str
        Data root to install into; if None, the directory returned by
        ``_default_download_dir`` is used. (Within this function the
        parameter shadows the module-level ``data_path`` list.)
    force : bool
        If True, install even when an equal or newer version is already
        installed
    silent : bool
        If True, print no progress messages

    """
    packages, collections = list_available_packages(url)
    installed = list_installed_packages(data_path)
    if data_path is None:
        data_path = _default_download_dir()
    os.makedirs(data_path, mode=0o775, exist_ok=True)

    for coll in collections:
        if resource_name == coll[0]:
            if not silent:  # pragma: no branch
                print('Installing {} collection'.format(coll[1]))  # noqa: T001
            for member in coll[2]:
                # Pass force/silent on, so that the caller's options apply to
                # every member of the collection and not only to the
                # collection banner.
                download_package(
                    member, url, data_path, force=force, silent=silent
                )
            return

    for pack in packages:
        if resource_name != pack[0]:
            continue

        if not force:
            up_to_date = any(
                pack[0] == inst[0] and pack[2] <= inst[2] for inst in installed
            )
            if up_to_date:
                if not silent:
                    print(  # pragma: no cover  # noqa: T001
                        '{} package already up-to-date'.format(pack[1])
                    )
                return

        if not silent:  # pragma: no branch
            print('Installing {} package'.format(pack[1]))  # noqa: T001

        target_dir = os.path.join(data_path, pack[4])
        zip_fn = os.path.join(target_dir, pack[0] + '.zip')
        os.makedirs(target_dir, mode=0o775, exist_ok=True)

        # The metadata file sits next to the archive, with the same name but
        # an "xml" extension in place of "zip".
        urllib.urlretrieve(  # noqa: S310
            pack[3][:-3] + 'xml', zip_fn[:-3] + 'xml'
        )
        urllib.urlretrieve(pack[3], zip_fn)  # noqa: S310

        # The context manager closes the archive even if extraction fails.
        with zipfile.ZipFile(zip_fn) as zip_pkg:
            zip_pkg.extractall(target_dir)
        os.remove(zip_fn)
# When executed as a script, run the doctests contained in this module.
if __name__ == '__main__':
    import doctest

    doctest.testmod()