import re
from typing import Optional, Tuple, Union
# Building blocks for the filename regexes below.  All of them are plain
# regex fragments (not compiled) so that they can be spliced into larger
# patterns with ``str.format()``.

#: A project name: starts and ends with an ASCII alphanumeric, with
#: alphanumerics, ``.``, ``_``, and ``-`` permitted in between
PROJECT_NAME = r'[A-Za-z0-9](?:[A-Za-z0-9._-]*[A-Za-z0-9])?'

#: Like `PROJECT_NAME`, but hyphens are not permitted; used by formats in
#: which ``-`` is strictly a field separator
PROJECT_NAME_NODASH = r'[A-Za-z0-9](?:[A-Za-z0-9._]*[A-Za-z0-9])?'

#: A version string.  The match is non-greedy so that whatever pattern
#: follows the version (platform tag, extension, ...) claims as much of the
#: filename as it can.
VERSION = r'[A-Za-z0-9_.!+-]+?'

#: Like `VERSION`, but hyphens are not permitted
VERSION_NODASH = r'[A-Za-z0-9_.!+]+?'

#: The file extension (including the leading period) of a supported archive
ARCHIVE_EXT = r'\.(?:tar|tar\.(?:bz2|gz|lz|lzma|xz|Z)|tbz|tgz|tlz|txz|zip)'

#: A platform tag: a known OS prefix followed by arbitrary tag characters
PLAT_NAME = r'(?:aix|cygwin|darwin|linux|macosx|solaris|sunos|[wW]in)[-.A-Za-z0-9_]*'

#: A Python version tag of the form ``pyX.Y``
PYVER = r'py[0-9]+\.[0-9]+'

# NOTE: Every pattern below is anchored at the end with ``\Z`` rather than
# ``$``.  In Python, ``$`` also matches just before a newline at the end of the
# string, which would let a name like ``"foo-1.0.tar.gz\n"`` be accepted.

#: Regexes for package filenames that can be parsed unambiguously
GOOD_PACKAGE_RGXN = [
    # See <https://setuptools.readthedocs.io/en/latest
    #      /formats.html#filename-embedded-metadata>:
    ('egg', re.compile(
        r'^(?P<project>{})-(?P<version>{})(?:-{}(?:-{})?)?\.egg\Z'
        .format(PROJECT_NAME_NODASH, VERSION_NODASH, PYVER, PLAT_NAME)
    )),

    # See <http://ftp.rpm.org/max-rpm/ch-rpm-file-format.html>:
    # (The architecture pattern is mainly just a guess based on what's
    # currently on PyPI.)
    ('rpm', re.compile(
        r'^(?P<project>{})-(?P<version>{})-[^-]+\.[A-Za-z0-9._]+\.rpm\Z'
        .format(PROJECT_NAME, VERSION_NODASH)
    )),

    # Regex adapted from <https://git.io/fAclu>:
    ('wheel', re.compile(
        r'^(?P<project>{})-(?P<version>{})(-[0-9][^-]*?)?'
        r'-.+?-.+?-.+?\.whl\Z'
        .format(PROJECT_NAME_NODASH, VERSION_NODASH)
    )),
]

#: Partial regexes for package filenames with ambiguous grammars.  Each one
#: matches everything after the project name, starting at the hyphen that
#: separates the project name from the version.  If a hint as to the expected
#: project name is given, a pattern built from the hint is matched first and
#: these regexes are applied to the remainder of the filename; otherwise, a
#: generic pattern that matches all project names is prepended (see
#: `BAD_PACKAGE_RGXN`).
BAD_PACKAGE_BASES = [
    # See <https://git.io/fAclc>:
    ('dumb', re.compile(
        r'-(?P<version>{})\.{}{}\Z'.format(VERSION, PLAT_NAME, ARCHIVE_EXT)
    )),

    # See <https://git.io/fAclv>:
    ('msi', re.compile(
        r'-(?P<version>{})\.{}(?:-{})?\.msi\Z'
        .format(VERSION, PLAT_NAME, PYVER)
    )),

    ('sdist', re.compile(
        r'-(?P<version>{}){}\Z'.format(VERSION, ARCHIVE_EXT)
    )),

    # See <https://git.io/fAclL>:
    ('wininst', re.compile(
        r'-(?P<version>{})\.{}(?:-{})?\.exe\Z'
        .format(VERSION, PLAT_NAME, PYVER)
    )),
]

#: Regexes for package filenames with ambiguous grammars, using a generic
#: pattern that matches all project names.  Entries are in the same order as
#: `BAD_PACKAGE_BASES`.
BAD_PACKAGE_RGXN = [
    (pkg_type, re.compile('^(?P<project>' + PROJECT_NAME + ')' + rgx.pattern))
    for pkg_type, rgx in BAD_PACKAGE_BASES
]
def parse_filename(filename: str, project_hint: Optional[str] = None) \
        -> Union[Tuple[str, str, str], Tuple[None, None, None]]:
    """
    Given the filename of a distribution package, returns a triple of the
    project name, project version, and package type.  The name and version are
    spelled the same as they appear in the filename; no normalization is
    performed.

    The package type may be any of the following strings:

    - ``'dumb'``
    - ``'egg'``
    - ``'msi'``
    - ``'rpm'``
    - ``'sdist'``
    - ``'wheel'``
    - ``'wininst'``

    If the filename cannot be parsed, ``(None, None, None)`` is returned.

    Note that some filenames (e.g., :file:`1-2-3.tar.gz`) may be ambiguous as
    to which part is the project name and which is the version.  In order to
    resolve the ambiguity, the expected value for the project name (*modulo*
    normalization) can be supplied as the ``project_hint`` argument to the
    function.  If the filename can be parsed with the given string in the role
    of the project name, the results of that parse will be returned; otherwise,
    the function will fall back to breaking the project & version apart at an
    unspecified point.

    The hint is only consulted for the ambiguous formats (dumb, msi, sdist,
    wininst); eggs, RPMs, and wheels are always parsed by their own grammars.
    An empty hint is treated the same as no hint.

    :param str filename: The package filename to parse
    :param Optional[str] project_hint: Optionally, the expected value for the
        project name (usually the name of the project page on which the
        filename was found).  The name does not need to be normalized.
    :rtype: Union[Tuple[str, str, str], Tuple[None, None, None]]
    """
    # Formats with an unambiguous grammar are tried first and never need the
    # hint.
    for pkg_type, rgx in GOOD_PACKAGE_RGXN:
        m = rgx.match(filename)
        if m:
            return (m.group('project'), m.group('version'), pkg_type)

    # An empty hint would compile to an empty pattern and produce an empty
    # project name, so it is skipped just like ``None``.
    if project_hint:
        # Turn the hint into a regex that matches any spelling of the same
        # name: every run of non-alphanumerics matches any run of "-", "_",
        # and ".", and every ASCII letter matches in either case.  No escaping
        # is needed, as only alphanumerics from the hint survive the first
        # substitution.
        proj_rgx = re.sub(r'[^A-Za-z0-9]+', '[-_.]+', project_hint)
        proj_rgx = re.sub(
            r'([A-Za-z])',
            lambda letter: (
                '[' + letter.group(1).upper() + letter.group(1).lower() + ']'
            ),
            proj_rgx,
        )
        # The lookahead requires the name to be followed by a hyphen without
        # consuming it; the partial regexes expect to start at that hyphen.
        hint_match = re.match(proj_rgx + r'(?=-)', filename)
        if hint_match:
            project = hint_match.group(0)
            rest_of_name = filename[hint_match.end(0):]
            for pkg_type, rgx in BAD_PACKAGE_BASES:
                m = rgx.match(rest_of_name)
                if m:
                    return (project, m.group('version'), pkg_type)

    # No usable hint (or the hint did not fit): let the generic project-name
    # pattern decide where the name ends and the version begins.
    for pkg_type, rgx in BAD_PACKAGE_RGXN:
        m = rgx.match(filename)
        if m:
            return (m.group('project'), m.group('version'), pkg_type)

    return (None, None, None)