from __future__ import annotations
import re
from typing import Optional
from .errors import UnparsableFilenameError
# Regex *fragments* (unanchored strings, not compiled patterns) that are
# spliced into the full filename regexes defined further down.

# Project name: starts and ends with an ASCII alphanumeric; dots, underscores,
# and hyphens may appear in between.
PROJECT_NAME = r"[A-Za-z0-9](?:[A-Za-z0-9._-]*[A-Za-z0-9])?"
# Same as PROJECT_NAME, except that hyphens are not allowed.
PROJECT_NAME_NODASH = r"[A-Za-z0-9](?:[A-Za-z0-9._]*[A-Za-z0-9])?"

# Version string: a non-greedy run of alphanumerics and ``_ . ! + -``.
VERSION = r"[A-Za-z0-9_.!+-]+?"
# Same as VERSION, except that hyphens are not allowed.
VERSION_NODASH = r"[A-Za-z0-9_.!+]+?"

# Archive extension (including the leading dot): plain or compressed tarballs,
# the short tarball spellings, and zip.
_TAR_COMPRESSIONS = ("bz2", "gz", "lz", "lzma", "xz", "Z")
_ARCHIVE_SUFFIXES = (
    "tar",
    r"tar\.(?:" + "|".join(_TAR_COMPRESSIONS) + ")",
    "tbz",
    "tgz",
    "tlz",
    "txz",
    "zip",
)
ARCHIVE_EXT = r"\.(?:" + "|".join(_ARCHIVE_SUFFIXES) + ")"

# Platform tag: one of the listed prefixes followed by any run of
# alphanumerics, hyphens, dots, and underscores.
_PLATFORM_PREFIXES = (
    "aix",
    "cygwin",
    "darwin",
    "linux",
    "macosx",
    "solaris",
    "sunos",
    "[wW]in",
)
PLAT_NAME = "(?:" + "|".join(_PLATFORM_PREFIXES) + ")[-.A-Za-z0-9_]*"

# Python version tag: "py" followed by two dot-separated runs of digits.
PYVER = r"py[0-9]+\.[0-9]+"
#: ``(package type, compiled regex)`` pairs for filename formats that can be
#: parsed unambiguously.  Each regex is anchored at both ends and captures
#: ``project`` and ``version`` groups.
GOOD_PACKAGE_RGXN = [
    # Eggs: name-version, then an optional Python version tag, which may in
    # turn be followed by a platform tag.  See
    # <https://setuptools.readthedocs.io/en/latest
    # /formats.html#filename-embedded-metadata>:
    (
        "egg",
        re.compile(
            rf"^(?P<project>{PROJECT_NAME_NODASH})-(?P<version>{VERSION_NODASH})"
            rf"(?:-{PYVER}(?:-{PLAT_NAME})?)?\.egg$"
        ),
    ),
    # RPMs: name-version-release.arch.rpm.  See
    # <http://ftp.rpm.org/max-rpm/ch-rpm-file-format.html>:
    # (The architecture pattern is mainly just a guess based on what's
    # currently on PyPI.)
    (
        "rpm",
        re.compile(
            rf"^(?P<project>{PROJECT_NAME})-(?P<version>{VERSION_NODASH})"
            r"-[^-]+\.[A-Za-z0-9._]+\.rpm$"
        ),
    ),
    # Wheels: name-version, an optional hyphen-free component starting with a
    # digit, then three further hyphen-separated components.  Regex adapted
    # from
    # <https://github.com/pypa/pip/blob/18.0/src/pip/_internal/wheel.py#L569>:
    (
        "wheel",
        re.compile(
            rf"^(?P<project>{PROJECT_NAME_NODASH})-(?P<version>{VERSION_NODASH})"
            r"(-[0-9][^-]*?)?-.+?-.+?-.+?\.whl$"
        ),
    ),
]
#: ``(package type, compiled regex)`` pairs for filename formats with
#: ambiguous grammars.  These regexes are only *partial*: each one starts at
#: the hyphen that separates the project name from the version and captures a
#: ``version`` group, so it must be combined with a pattern for the project
#: name.  When a hint as to the expected project name is available, a pattern
#: derived from the hint is used for the name; otherwise, a generic pattern
#: that matches all project names is used.
BAD_PACKAGE_BASES = [
    # See <https://github.com/python/cpython/blob/v3.7.0/Lib/distutils/command/bdist_dumb.py#L93>:
    (
        "dumb",
        re.compile(rf"-(?P<version>{VERSION})\.{PLAT_NAME}{ARCHIVE_EXT}$"),
    ),
    # See <https://github.com/python/cpython/blob/v3.7.0/Lib/distutils/command/bdist_msi.py#L733>:
    (
        "msi",
        re.compile(rf"-(?P<version>{VERSION})\.{PLAT_NAME}(?:-{PYVER})?\.msi$"),
    ),
    # Source distributions: just a version followed by an archive extension
    ("sdist", re.compile(rf"-(?P<version>{VERSION}){ARCHIVE_EXT}$")),
    # See <https://github.com/python/cpython/blob/v3.7.0/Lib/distutils/command/bdist_wininst.py#L292>:
    (
        "wininst",
        re.compile(rf"-(?P<version>{VERSION})\.{PLAT_NAME}(?:-{PYVER})?\.exe$"),
    ),
]
#: Full-filename counterparts of `BAD_PACKAGE_BASES`: each partial regex is
#: prefixed with an anchored ``project`` group that uses the generic
#: `PROJECT_NAME` pattern, for use when parsing the ambiguous formats without
#: relying on a project name hint
BAD_PACKAGE_RGXN = [
    (kind, re.compile(rf"^(?P<project>{PROJECT_NAME}){base.pattern}"))
    for kind, base in BAD_PACKAGE_BASES
]
def parse_filename(
    filename: str, project_hint: Optional[str] = None
) -> tuple[str, str, str]:
    """
    Given the filename of a distribution package, returns a triple of the
    project name, project version, and package type.  The name and version are
    spelled the same as they appear in the filename; no normalization is
    performed.

    The package type may be any of the following strings:

    - ``'dumb'``
    - ``'egg'``
    - ``'msi'``
    - ``'rpm'``
    - ``'sdist'``
    - ``'wheel'``
    - ``'wininst'``

    Note that some filenames (e.g., :file:`1-2-3.tar.gz`) may be ambiguous as
    to which part is the project name and which is the version.  In order to
    resolve the ambiguity, the expected value for the project name (*modulo*
    normalization) can be supplied as the ``project_hint`` argument to the
    function.  If the filename can be parsed with the given string in the role
    of the project name, the results of that parse will be returned; otherwise,
    the function will fall back to breaking the project & version apart at an
    unspecified point.

    .. versionchanged:: 1.0.0

        Now raises `UnparsableFilenameError` for unparsable filenames instead
        of returning all `None`\\s

    :param str filename: The package filename to parse
    :param Optional[str] project_hint: Optionally, the expected value for the
        project name (usually the name of the project page on which the
        filename was found).  The name does not need to be normalized.
    :rtype: tuple[str, str, str]
    :raises UnparsableFilenameError: if the filename cannot be parsed
    """
    # Formats with an unambiguous grammar are tried first; the hint is not
    # consulted for them.
    for pkg_type, rgx in GOOD_PACKAGE_RGXN:
        m = rgx.match(filename)
        if m:
            return (m.group("project"), m.group("version"), pkg_type)
    if project_hint is not None:
        # Build a regex from the hint that tolerates differences in spelling:
        # every run of non-alphanumeric characters matches any run of hyphens,
        # underscores, and dots ...
        proj_rgx = re.sub(r"[^A-Za-z0-9]+", "[-_.]+", project_hint)
        # ... and every ASCII letter matches in either case.
        proj_rgx = re.sub(
            r"([A-Za-z])",
            lambda letter: (
                "[" + letter.group(1).upper() + letter.group(1).lower() + "]"
            ),
            proj_rgx,
        )
        # The lookahead requires the name to be immediately followed by the
        # hyphen that the partial regexes in BAD_PACKAGE_BASES start with.
        m = re.match(proj_rgx + r"(?=-)", filename)
        if m:
            project = m.group(0)
            rest_of_name = filename[m.end(0) :]
            for pkg_type, rgx in BAD_PACKAGE_BASES:
                m = rgx.match(rest_of_name)
                if m:
                    return (project, m.group("version"), pkg_type)
    # Either no hint was given or the filename could not be parsed with it:
    # fall back to the generic project name pattern.
    for pkg_type, rgx in BAD_PACKAGE_RGXN:
        m = rgx.match(filename)
        if m:
            return (m.group("project"), m.group("version"), pkg_type)
    raise UnparsableFilenameError(filename)