Fix #57: drop dead _app- filter and refresh hardcoded /api/bleed fallbacks #58
@@ -75,7 +75,7 @@ def __extract_search_url_script(self, script_content: str):

 class SearchAuthToken:
-    search_url = "api/s"
+    search_url = "api/bleed"
     search_url_endpoint = "/init"
     auth_token = None
     auth_key = None
@@ -105,8 +105,12 @@ class HTMLRequests:
     BASE_URL = 'https://howlongtobeat.com/'
     REFERER_HEADER = BASE_URL
     GAME_URL = BASE_URL + "game"
-    # Static search url to use in case it can't be extracted from JS code
-    SEARCH_URL = BASE_URL + "api/s/"
+    # Static search url used as a fallback if extraction from JS fails.
+    # HLTB rotates this name periodically (api/find -> api/finder -> api/bleed,
+    # current as of 2026-05). The runtime extraction in
+    # send_website_request_getcode is the source of truth — this is just
+    # a backstop.
+    SEARCH_URL = BASE_URL + "api/bleed"
Owner: Same reason as above + remove the comments
     HTML_PARSER = 'html.parser'

     @staticmethod
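The new comment block describes a source-of-truth/backstop split between the scraped endpoint and this constant. A minimal sketch of a caller honouring that split (the helper name resolve_search_url is illustrative and not part of the library; BASE_URL, SEARCH_URL and send_website_request_getcode are the names from the diff above, and the import path is assumed from the new tests):

# Illustrative helper only, not library code. Assumes the HTMLRequests class
# lives in howlongtobeatpy.HTMLRequests, the module the new tests import from.
from howlongtobeatpy.HTMLRequests import HTMLRequests

def resolve_search_url(user_agent: str) -> str:
    # Source of truth: the endpoint scraped at runtime from HLTB's JS bundle.
    search_info = HTMLRequests.send_website_request_getcode(False, user_agent)
    if search_info is not None and search_info.search_url is not None:
        return HTMLRequests.BASE_URL + search_info.search_url
    # Backstop: the hardcoded constant, which goes stale whenever HLTB
    # rotates the endpoint name again.
    return HTMLRequests.SEARCH_URL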
@@ -347,21 +351,23 @@ def send_website_request_getcode(parse_all_scripts: bool, user_agent):
         """
         Function that send a request to howlongtobeat to scrape the correct search url
         @return: The search informations to use in the request
+
+        Note: ``parse_all_scripts`` is kept for backward compatibility but no
+        longer changes which scripts are inspected. HLTB used to bundle the
+        relevant code under ``_app-*.js``, but the modern (Turbopack) build
+        emits opaque chunk names like ``0-~-0up.q3_p0.js``, so a name-based
+        filter is no longer reliable — we iterate every ``<script src>`` tag
+        and stop at the first one that yields a ``search_url``.
         """
         # Make the post request and return the result if is valid
         headers = HTMLRequests.get_title_request_headers(user_agent)
         resp = requests.get(HTMLRequests.BASE_URL, headers=headers, timeout=60)
         if resp.status_code == 200 and resp.text is not None:
             # Parse the HTML content using BeautifulSoup
             soup = BeautifulSoup(resp.text, HTMLRequests.HTML_PARSER)
-            # Find all <script> tags with a src attribute containing the substring
             scripts = soup.find_all('script', src=True)
-            if parse_all_scripts:
-                matching_scripts = [script['src'] for script in scripts]
-            else:
-                matching_scripts = [script['src'] for script in scripts if '_app-' in script['src']]
-            for script_url in matching_scripts:
-                script_url = HTMLRequests.BASE_URL + script_url
+            for script in scripts:
+                script_url = HTMLRequests.BASE_URL + script['src']
                 script_resp = requests.get(script_url, headers=headers, timeout=60)
                 if script_resp.status_code == 200 and script_resp.text is not None:
                     search_info = SearchInformations(script_resp.text)
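To make the docstring's point concrete: chunk names like the one it cites contain no "_app-" substring, so with the old name filter active nothing matched and no script was ever fetched. A standalone illustration (the second chunk name is invented for the example):

# The first name is the example cited in the docstring above; the second is
# made up. Neither contains "_app-", so the old filter yields an empty list.
srcs = ["0-~-0up.q3_p0.js", "9f3c.abcd12.js"]

old_selection = [s for s in srcs if '_app-' in s]  # pre-PR filter
new_selection = [s for s in srcs]                  # post-PR: try every script

assert old_selection == []
assert new_selection == srcs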
@@ -374,36 +380,31 @@ async def async_send_website_request_getcode(parse_all_scripts: bool, user_agent
         """
         Function that send a request to howlongtobeat to scrape the correct search url
         @return: The search informations to use in the request
+
+        See ``send_website_request_getcode`` for why ``parse_all_scripts`` is
+        no longer used.
         """
         # Make the post request and return the result if is valid
         headers = HTMLRequests.get_title_request_headers(user_agent)
         timeout = aiohttp.ClientTimeout(total=60)
         async with aiohttp.ClientSession() as session:
             async with session.get(HTMLRequests.BASE_URL, headers=headers, timeout=timeout) as resp:
-                if resp is not None and resp.status == 200:
-                    resp_text = await resp.text()
-                    # Parse the HTML content using BeautifulSoup
-                    soup = BeautifulSoup(resp_text, HTMLRequests.HTML_PARSER)
-                    # Find all <script> tags with a src attribute containing the substring
-                    scripts = soup.find_all('script', src=True)
-                    if parse_all_scripts:
-                        matching_scripts = [script['src'] for script in scripts]
-                    else:
-                        matching_scripts = [script['src'] for script in scripts if '_app-' in script['src']]
-                    for script_url in matching_scripts:
-                        script_url = HTMLRequests.BASE_URL + script_url
-                        async with aiohttp.ClientSession() as session:
-                            async with session.get(script_url, headers=headers, timeout=timeout) as script_resp:
-                                if script_resp is not None and resp.status == 200:
-                                    script_resp_text = await script_resp.text()
-                                    search_info = SearchInformations(script_resp_text)
-                                    if search_info.search_url is not None:
-                                        # The api key is necessary
-                                        return search_info
-                                    else:
-                                        return None
-                else:
-                    return None
+                if resp is None or resp.status != 200:
+                    return None
+                resp_text = await resp.text()
Owner: This is a nice change but it inverted the "if" condition for no reason, redo the condition as before please so the diff is more noticeable
+                soup = BeautifulSoup(resp_text, HTMLRequests.HTML_PARSER)
+                scripts = soup.find_all('script', src=True)
+                for script in scripts:
+                    script_url = HTMLRequests.BASE_URL + script['src']
+                    async with aiohttp.ClientSession() as inner_session:
+                        async with inner_session.get(script_url, headers=headers, timeout=timeout) as script_resp:
+                            if script_resp is None or script_resp.status != 200:
+                                continue
+                            script_resp_text = await script_resp.text()
+                            search_info = SearchInformations(script_resp_text)
+                            if search_info.search_url is not None:
+                                return search_info
+                return None

     @staticmethod
     def get_auth_token_request_params():
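The reviewer comment inside the hunk above asks for the guard to keep its original polarity (no inverted early return) so the rewrite diffs more cleanly against the old code. A sketch of that shape, meant to sit inside the HTMLRequests class and relying on the module's existing imports (aiohttp, BeautifulSoup, SearchInformations); it is not the merged implementation:

    @staticmethod
    async def async_send_website_request_getcode(parse_all_scripts: bool, user_agent):
        # Same logic as the new version above, but the outer guard keeps the
        # original "is not None and status == 200" polarity.
        headers = HTMLRequests.get_title_request_headers(user_agent)
        timeout = aiohttp.ClientTimeout(total=60)
        async with aiohttp.ClientSession() as session:
            async with session.get(HTMLRequests.BASE_URL, headers=headers, timeout=timeout) as resp:
                if resp is not None and resp.status == 200:
                    resp_text = await resp.text()
                    soup = BeautifulSoup(resp_text, HTMLRequests.HTML_PARSER)
                    scripts = soup.find_all('script', src=True)
                    for script in scripts:
                        script_url = HTMLRequests.BASE_URL + script['src']
                        async with aiohttp.ClientSession() as inner_session:
                            async with inner_session.get(script_url, headers=headers, timeout=timeout) as script_resp:
                                if script_resp is not None and script_resp.status == 200:
                                    script_resp_text = await script_resp.text()
                                    search_info = SearchInformations(script_resp_text)
                                    if search_info.search_url is not None:
                                        return search_info
                return None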
New test file:

@@ -0,0 +1,49 @@
+"""
+Hermetic unit tests for the search-URL discovery regex.
+
+These don't hit the network and are intended to catch silent regressions
+when HLTB rotates their endpoint name (which they do periodically — most
+recently from /api/finder to /api/bleed).
+"""
+from unittest import TestCase
+
+from howlongtobeatpy.HTMLRequests import SearchInformations
+
+
+# Real shape captured from HLTB's Turbopack chunk on 2026-05-07. If HLTB
+# rotates the endpoint name again, update this fixture to the new shape and
+# the tests should still pass without code changes.
+BLEED_CHUNK_SHAPE = (
+    '...he:!(u?.user_id>0)};a&&(s[a]=l);'
+    'let i=await fetch("/api/bleed",{method:"POST",'
+    'headers:{"Content-Type":"application/json",'
+    '"x-auth-token":t,"x-hp-key":a,"x-hp-val":l},'
+    'body:JSON.stringify(s)});'
+    'if(403===i.status&&!e){...'
+)
+
+
+class TestSearchUrlExtraction(TestCase):
+
+    def test_extracts_current_api_bleed_endpoint(self):
+        info = SearchInformations(BLEED_CHUNK_SHAPE)
+        self.assertEqual("api/bleed", info.search_url)
+
+    def test_extracts_a_hypothetical_future_endpoint(self):
+        future = BLEED_CHUNK_SHAPE.replace("/api/bleed", "/api/somethingnew")
+        info = SearchInformations(future)
+        self.assertEqual("api/somethingnew", info.search_url)
+
+    def test_returns_none_when_no_post_fetch_present(self):
+        info = SearchInformations('var x = 1; console.log("hello")')
+        self.assertIsNone(info.search_url)
+
+    def test_ignores_get_only_fetches(self):
+        get_only = 'fetch("/api/bleed",{method:"GET"})'
+        info = SearchInformations(get_only)
+        self.assertIsNone(info.search_url)
+
+    def test_extracts_root_path_from_versioned_endpoint(self):
+        versioned = BLEED_CHUNK_SHAPE.replace("/api/bleed", "/api/bleed/v2")
+        info = SearchInformations(versioned)
+        self.assertEqual("api/bleed", info.search_url)
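These tests pin down the expected behaviour, but the extractor itself (SearchInformations.__extract_search_url_script, named in the first hunk header) is not part of this diff. One regular expression that would satisfy all five cases, purely as a sketch and not the library's actual implementation:

import re

# Sketch only: matches the endpoint path of a POST fetch("/api/...") call.
_POST_FETCH = re.compile(
    r'fetch\(\s*["\']/api/([A-Za-z0-9_]+)[^"\']*["\']\s*,\s*\{[^}]*method\s*:\s*["\']POST["\']'
)

def extract_search_url(script_content):
    match = _POST_FETCH.search(script_content)
    if match is None:
        return None
    # Keep only the first path segment after /api/, so "/api/bleed/v2"
    # still yields "api/bleed".
    return "api/" + match.group(1)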
Review comment on the original SEARCH_URL value: This was intentionally /s so that if the "retrieve url" doesn't work it's easily noticeable because this variable was not changed