67 changes: 34 additions & 33 deletions howlongtobeatpy/howlongtobeatpy/HTMLRequests.py
@@ -75,7 +75,7 @@ def __extract_search_url_script(self, script_content: str):


class SearchAuthToken:
search_url = "api/s"
search_url = "api/bleed"
Owner:
This was intentionally "api/s", so that if the "retrieve url" logic doesn't work it's easily noticeable, because this variable was not changed.
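For context, a hypothetical sketch of the canary idea (not code from this PR or the library; names and call shapes are simplified): if runtime extraction fails and the code falls back to a constant that was deliberately left stale, the breakage surfaces as failing requests instead of being silently masked.

# Hypothetical illustration only; names and call shapes are simplified.
search_info = HTMLRequests.send_website_request_getcode(False, "some user agent")
if search_info is not None and search_info.search_url is not None:
    search_url = HTMLRequests.BASE_URL + search_info.search_url  # freshly scraped
else:
    # Stale on purpose ("api/s"): requests against it fail loudly, so a broken
    # extractor is noticed immediately instead of appearing to work.
    search_url = HTMLRequests.SEARCH_URL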

search_url_endpoint = "/init"
auth_token = None
auth_key = None
@@ -105,8 +105,12 @@ class HTMLRequests:
BASE_URL = 'https://howlongtobeat.com/'
REFERER_HEADER = BASE_URL
GAME_URL = BASE_URL + "game"
# Static search url to use in case it can't be extracted from JS code
SEARCH_URL = BASE_URL + "api/s/"
# Static search url used as a fallback if extraction from JS fails.
# HLTB rotates this name periodically (api/find -> api/finder -> api/bleed,
# current as of 2026-05). The runtime extraction in
# send_website_request_getcode is the source of truth — this is just
# a backstop.
SEARCH_URL = BASE_URL + "api/bleed"
Owner @ScrappyCocco (May 7, 2026):
Same reason as above + remove the comments
I think no change is required, both here and above.

HTML_PARSER = 'html.parser'

@staticmethod
@@ -347,21 +351,23 @@ def send_website_request_getcode(parse_all_scripts: bool, user_agent):
"""
Function that sends a request to howlongtobeat to scrape the correct search url
@return: The search information to use in the request

Note: ``parse_all_scripts`` is kept for backward compatibility but no
longer changes which scripts are inspected. HLTB used to bundle the
relevant code under ``_app-*.js``, but the modern (Turbopack) build
emits opaque chunk names like ``0-~-0up.q3_p0.js``, so a name-based
filter is no longer reliable — we iterate every ``<script src>`` tag
and stop at the first one that yields a ``search_url``.
"""
# Make the request and return the result if it is valid
headers = HTMLRequests.get_title_request_headers(user_agent)
resp = requests.get(HTMLRequests.BASE_URL, headers=headers, timeout=60)
if resp.status_code == 200 and resp.text is not None:
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(resp.text, HTMLRequests.HTML_PARSER)
# Find all <script> tags with a src attribute containing the substring
scripts = soup.find_all('script', src=True)
if parse_all_scripts:
matching_scripts = [script['src'] for script in scripts]
else:
matching_scripts = [script['src'] for script in scripts if '_app-' in script['src']]
for script_url in matching_scripts:
script_url = HTMLRequests.BASE_URL + script_url
for script in scripts:
script_url = HTMLRequests.BASE_URL + script['src']
script_resp = requests.get(script_url, headers=headers, timeout=60)
if script_resp.status_code == 200 and script_resp.text is not None:
search_info = SearchInformations(script_resp.text)
@@ -374,36 +380,31 @@ async def async_send_website_request_getcode(parse_all_scripts: bool, user_agent
"""
Function that sends a request to howlongtobeat to scrape the correct search url
@return: The search information to use in the request

See ``send_website_request_getcode`` for why ``parse_all_scripts`` is
no longer used.
"""
# Make the request and return the result if it is valid
headers = HTMLRequests.get_title_request_headers(user_agent)
timeout = aiohttp.ClientTimeout(total=60)
async with aiohttp.ClientSession() as session:
async with session.get(HTMLRequests.BASE_URL, headers=headers, timeout=timeout) as resp:
if resp is not None and resp.status == 200:
resp_text = await resp.text()
# Parse the HTML content using BeautifulSoup
soup = BeautifulSoup(resp_text, HTMLRequests.HTML_PARSER)
# Find all <script> tags with a src attribute containing the substring
scripts = soup.find_all('script', src=True)
if parse_all_scripts:
matching_scripts = [script['src'] for script in scripts]
else:
matching_scripts = [script['src'] for script in scripts if '_app-' in script['src']]
for script_url in matching_scripts:
script_url = HTMLRequests.BASE_URL + script_url
async with aiohttp.ClientSession() as session:
async with session.get(script_url, headers=headers, timeout=timeout) as script_resp:
if script_resp is not None and resp.status == 200:
script_resp_text = await script_resp.text()
search_info = SearchInformations(script_resp_text)
if search_info.search_url is not None:
# The api key is necessary
return search_info
else:
return None
else:
if resp is None or resp.status != 200:
return None
resp_text = await resp.text()
Owner:
This is a nice change, but it inverted the "if" condition for no reason; please redo the condition as before, so that the real change is more noticeable in the diff.
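For illustration, a sketch of what keeping the original orientation could look like here (same behavior as the PR's early return, only nested the way the old code was, so that the loop rewrite is the only thing that stands out in the diff):

# Sketch only; mirrors the PR's logic with the old condition shape restored.
if resp is not None and resp.status == 200:
    resp_text = await resp.text()
    soup = BeautifulSoup(resp_text, HTMLRequests.HTML_PARSER)
    scripts = soup.find_all('script', src=True)
    for script in scripts:
        script_url = HTMLRequests.BASE_URL + script['src']
        async with aiohttp.ClientSession() as inner_session:
            async with inner_session.get(script_url, headers=headers, timeout=timeout) as script_resp:
                if script_resp is None or script_resp.status != 200:
                    continue
                script_resp_text = await script_resp.text()
                search_info = SearchInformations(script_resp_text)
                if search_info.search_url is not None:
                    return search_info
    return None
else:
    return None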

soup = BeautifulSoup(resp_text, HTMLRequests.HTML_PARSER)
scripts = soup.find_all('script', src=True)
for script in scripts:
script_url = HTMLRequests.BASE_URL + script['src']
async with aiohttp.ClientSession() as inner_session:
async with inner_session.get(script_url, headers=headers, timeout=timeout) as script_resp:
if script_resp is None or script_resp.status != 200:
continue
script_resp_text = await script_resp.text()
search_info = SearchInformations(script_resp_text)
if search_info.search_url is not None:
return search_info
return None

@staticmethod
def get_auth_token_request_params():
2 changes: 1 addition & 1 deletion howlongtobeatpy/setup.py
@@ -4,7 +4,7 @@
long_description = fh.read()

setup(name='howlongtobeatpy',
version='1.0.21',
version='1.0.22',
packages=find_packages(exclude=['tests']),
description='A Python API for How Long to Beat',
long_description=long_description,
49 changes: 49 additions & 0 deletions howlongtobeatpy/tests/test_search_url_extraction.py
@@ -0,0 +1,49 @@
"""
Hermetic unit tests for the search-URL discovery regex.

These don't hit the network and are intended to catch silent regressions
when HLTB rotates their endpoint name (which they do periodically — most
recently from /api/finder to /api/bleed).
"""
from unittest import TestCase

from howlongtobeatpy.HTMLRequests import SearchInformations


# Real shape captured from HLTB's Turbopack chunk on 2026-05-07. If HLTB
# rotates the endpoint name again, update this fixture (and the expected
# endpoint below) to the new shape; the extraction code itself should not
# need changes.
BLEED_CHUNK_SHAPE = (
    '...he:!(u?.user_id>0)};a&&(s[a]=l);'
    'let i=await fetch("/api/bleed",{method:"POST",'
    'headers:{"Content-Type":"application/json",'
    '"x-auth-token":t,"x-hp-key":a,"x-hp-val":l},'
    'body:JSON.stringify(s)});'
    'if(403===i.status&&!e){...'
)


class TestSearchUrlExtraction(TestCase):

    def test_extracts_current_api_bleed_endpoint(self):
        info = SearchInformations(BLEED_CHUNK_SHAPE)
        self.assertEqual("api/bleed", info.search_url)

    def test_extracts_a_hypothetical_future_endpoint(self):
        future = BLEED_CHUNK_SHAPE.replace("/api/bleed", "/api/somethingnew")
        info = SearchInformations(future)
        self.assertEqual("api/somethingnew", info.search_url)

    def test_returns_none_when_no_post_fetch_present(self):
        info = SearchInformations('var x = 1; console.log("hello")')
        self.assertIsNone(info.search_url)

    def test_ignores_get_only_fetches(self):
        get_only = 'fetch("/api/bleed",{method:"GET"})'
        info = SearchInformations(get_only)
        self.assertIsNone(info.search_url)

    def test_extracts_root_path_from_versioned_endpoint(self):
        versioned = BLEED_CHUNK_SHAPE.replace("/api/bleed", "/api/bleed/v2")
        info = SearchInformations(versioned)
        self.assertEqual("api/bleed", info.search_url)