from typing import Dict, Iterator, List, Optional, Tuple, Union
from urllib.parse import urljoin
from warnings import warn
from bs4 import BeautifulSoup
from .classes import DistributionPackage
from .parse_repo import parse_repo_links
def parse_simple_index(
    html: Union[str, bytes],
    base_url: Optional[str] = None,
    from_encoding: Optional[str] = None,
) -> Iterator[Tuple[str, str]]:
    """
    Parse a simple repository's index page and return a generator of
    ``(project name, project URL)`` pairs.

    The deprecation warning is emitted as soon as this function is called and
    is attributed to the caller; the HTML itself is only parsed once iteration
    of the returned generator begins.

    .. deprecated:: 0.7.0
        Use `parse_repo_index_page()` or `parse_links_stream()` instead

    :param html: the HTML to parse
    :type html: str or bytes
    :param Optional[str] base_url: an optional URL to join to the front of the
        URLs returned (usually the URL of the page being parsed)
    :param Optional[str] from_encoding: an optional hint to Beautiful Soup as
        to the encoding of ``html`` when it is `bytes` (usually the ``charset``
        parameter of the response's :mailheader:`Content-Type` header)
    :rtype: Iterator[Tuple[str, str]]
    :raises UnsupportedRepoVersionError: if the repository version has a
        greater major component than the supported repository version
        (raised when iteration starts, as that is when the page is parsed)
    """
    # Warn here rather than inside the generator body: a generator body does
    # not run until the first ``next()``, which would delay the warning.
    # ``stacklevel=2`` attributes the warning to the caller's line instead of
    # this module, so the default warning filters can surface it.
    warn(
        "parse_simple_index() is deprecated."
        " Use parse_repo_index_page() or parse_links_stream() instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    def _pairs() -> Iterator[Tuple[str, str]]:
        # Index 1 of parse_repo_links()'s result is the sequence of links.
        for link in parse_repo_links(html, base_url, from_encoding)[1]:
            yield (link.text, link.url)

    return _pairs()
def parse_project_page(
    html: Union[str, bytes],
    base_url: Optional[str] = None,
    from_encoding: Optional[str] = None,
    project_hint: Optional[str] = None,
) -> List[DistributionPackage]:
    """
    Parse a project page from a simple repository and return a list of
    `DistributionPackage` objects.

    .. deprecated:: 0.7.0
        Use `parse_repo_project_page()` instead

    :param html: the HTML to parse
    :type html: str or bytes
    :param Optional[str] base_url: an optional URL to join to the front of the
        packages' URLs (usually the URL of the page being parsed)
    :param Optional[str] from_encoding: an optional hint to Beautiful Soup as
        to the encoding of ``html`` when it is `bytes` (usually the ``charset``
        parameter of the response's :mailheader:`Content-Type` header)
    :param Optional[str] project_hint: The name of the project whose page is
        being parsed; used to disambiguate the parsing of certain filenames
    :rtype: List[DistributionPackage]
    :raises UnsupportedRepoVersionError: if the repository version has a
        greater major component than the supported repository version
    """
    # ``stacklevel=2`` attributes the warning to the caller's line rather than
    # to this module, so the default warning filters can surface it.
    warn(
        "parse_project_page() is deprecated."
        " Use parse_repo_project_page() instead.",
        DeprecationWarning,
        stacklevel=2,
    )
    # Index 1 of parse_repo_links()'s result is the sequence of links; each one
    # is converted to a DistributionPackage.
    return [
        DistributionPackage.from_link(link, project_hint)
        for link in parse_repo_links(html, base_url, from_encoding)[1]
    ]
def parse_links(
    html: Union[str, bytes],
    base_url: Optional[str] = None,
    from_encoding: Optional[str] = None,
) -> Iterator[Tuple[str, str, Dict[str, Union[str, List[str]]]]]:
    """
    Parse an HTML page and return a generator of links, where each link is
    represented as a triple of link text, link URL, and a `dict` of link tag
    attributes (including the unmodified ``href`` attribute).

    Link text has all leading & trailing whitespace removed.

    Keys in the attributes `dict` are converted to lowercase.

    If the page contains a ``<base>`` tag with an ``href`` attribute, that
    value is used as the base URL, resolved against ``base_url`` when one is
    given.

    The deprecation warning is emitted as soon as this function is called and
    is attributed to the caller; the HTML itself is only parsed once iteration
    of the returned generator begins.

    .. deprecated:: 0.7.0
        Use `parse_repo_links()` instead

    :param html: the HTML to parse
    :type html: str or bytes
    :param Optional[str] base_url: an optional URL to join to the front of the
        URLs returned (usually the URL of the page being parsed)
    :param Optional[str] from_encoding: an optional hint to Beautiful Soup as
        to the encoding of ``html`` when it is `bytes` (usually the ``charset``
        parameter of the response's :mailheader:`Content-Type` header)
    :rtype: Iterator[Tuple[str, str, Dict[str, Union[str, List[str]]]]]
    """
    # Warn here rather than inside the generator body: a generator body does
    # not run until the first ``next()``, which would delay the warning.
    # ``stacklevel=2`` attributes the warning to the caller's line instead of
    # this module, so the default warning filters can surface it.
    warn(
        "parse_links() is deprecated. Use parse_repo_links() instead.",
        DeprecationWarning,
        stacklevel=2,
    )

    def _links() -> Iterator[Tuple[str, str, Dict[str, Union[str, List[str]]]]]:
        soup = BeautifulSoup(html, "html.parser", from_encoding=from_encoding)
        # Only a <base> tag that actually carries an href can affect URLs.
        base_tag = soup.find("base", href=True)
        effective_base = base_url
        if base_tag is not None:
            base_href = base_tag["href"]
            # The <base> href may itself be relative to the page's URL.
            effective_base = (
                base_href
                if effective_base is None
                else urljoin(effective_base, base_href)
            )
        for link in soup.find_all("a", href=True):
            href = link["href"]
            # With no base URL at all, hrefs are passed through untouched.
            url = href if effective_base is None else urljoin(effective_base, href)
            yield ("".join(link.strings).strip(), url, link.attrs)

    return _links()