Skip to content

Commit e719a7e

Browse files
committed
Add an re2-based parser
Requires splitting out some of the testenvs, as re2 is not available for pypy at all, and not yet for 3.12.

Uses `re2.Filter`, which, unlike the C++ `FilteredRE2`, bundles the prefiltering step (using an `re2.Set`). This is likely less efficient than providing one's own prefilter (e.g. aho-corasick), but avoids having to do that.

At first glance, according to pytest's `--durations 0`, this is quite successful (unlike using `re2.Set`, which was more of a mixed bag):

```
2.54s call tests/test_core.py::test_devices[test_device.yaml-basic]
2.51s call tests/test_core.py::test_ua[pgts_browser_list.yaml-basic]
2.48s call tests/test_legacy.py::TestParse::testPGTSStrings
2.43s call tests/test_legacy.py::TestParse::testStringsDevice
0.95s call tests/test_core.py::test_devices[test_device.yaml-re2]
0.55s call tests/test_core.py::test_ua[pgts_browser_list.yaml-re2]
0.18s call tests/test_core.py::test_ua[test_ua.yaml-basic]
0.16s call tests/test_legacy.py::TestParse::testBrowserscopeStrings
0.10s call tests/test_core.py::test_ua[test_ua.yaml-re2]
```

While the "basic" parser for the new API is slightly slower than the legacy API (browserscope does use test_ua.yaml, so that matches), the re2 parser is significantly faster than both:

- 60% faster on test_device.yaml (~2.5s -> 1s)
- 80% faster on pgts (2.5s -> 0.5s)
- 40% faster on browserscope (0.16 -> 0.1)

This is very encouraging, although the memory consumption has not been checked (yet).

Fixes #149, kind of.
1 parent b408a13 commit e719a7e

4 files changed

Lines changed: 84 additions & 1 deletion

File tree

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ version = "1.0.0a1"
99
readme = "README.rst"
1010
requires-python = ">=3.8"
1111
dependencies = []
12-
optional-dependencies = { yaml = ["PyYaml"] }
12+
optional-dependencies = { yaml = ["PyYaml"], re2 = ["google-re2"] }
1313

1414
license = {text = "Apache 2.0"}
1515
urls = {repository = "https://github.com/ua-parser/uap-python"}

src/ua_parser/re2.py

Lines changed: 74 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,74 @@
1+
from __future__ import annotations
2+
3+
import io
4+
import os
5+
import re
6+
from typing import List, Tuple, Union
7+
8+
import re2 # type: ignore
9+
10+
from .core import (
11+
Parser as AbstractParser,
12+
PartialParseResult,
13+
Device,
14+
Domain,
15+
OS,
16+
UserAgent,
17+
UserAgentMatcher,
18+
OSMatcher,
19+
DeviceMatcher,
20+
)
21+
22+
23+
class _EmptyFilter:
    """Stand-in for an ``re2.Filter`` that holds no pattern.

    It exposes the one method the parser calls on its filters, ``Match``,
    and always reports that nothing matched.
    """

    def Match(self, text: str) -> List[int]:
        """Return an empty list of matching indices, whatever ``text`` is."""
        return []


def _build_filter(patterns: List[str]) -> Union[re2.Filter, _EmptyFilter]:
    """Compile ``patterns`` into a single filter, keeping their order.

    The index reported by the filter's ``Match`` is the position of the
    pattern in ``patterns``.

    An empty ``patterns`` list yields an :class:`_EmptyFilter` rather than
    an ``re2.Filter``: the underlying ``FilteredRE2`` rejects ``Compile()``
    when no pattern has been added, which leaves a filter that cannot be
    matched against.
    """
    if not patterns:
        return _EmptyFilter()

    compiled = re2.Filter()
    for pattern in patterns:
        compiled.Add(pattern)
    compiled.Compile()
    return compiled


class Parser(AbstractParser):
    """Parser which uses ``re2.Filter`` to find the applicable matcher.

    For each domain (user agent, OS, device) every matcher's pattern is
    added to one filter. Parsing asks the filter which patterns match the
    string, then runs the matcher with the lowest index on it.
    """

    # Each filter is built from the matcher list that follows it, in the
    # same order, so an index returned by ``Match`` indexes that list.
    ua: re2.Filter
    user_agent_matchers: List[UserAgentMatcher]
    os: re2.Filter
    os_matchers: List[OSMatcher]
    devices: re2.Filter
    device_matchers: List[DeviceMatcher]

    def __init__(
        self,
        matchers: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]],
    ) -> None:
        """Store the matchers and build one filter per domain.

        :param matchers: the user agent, OS and device matchers, in that
            order. Any of the three lists may be empty, in which case the
            corresponding domain never matches.
        """
        self.user_agent_matchers, self.os_matchers, self.device_matchers = matchers

        self.ua = _build_filter([u.regex.pattern for u in self.user_agent_matchers])
        self.os = _build_filter([o.regex.pattern for o in self.os_matchers])
        # Prepend the i global flag if IGNORECASE is set. Assumes
        # no pattern uses global flags, but since they're not
        # supported in JS that seems safe.
        self.devices = _build_filter(
            [
                "(?i)" + d.regex.pattern
                if d.regex.flags & re.IGNORECASE
                else d.regex.pattern
                for d in self.device_matchers
            ]
        )

    def __call__(self, ua: str, domains: Domain, /) -> PartialParseResult:
        """Parse ``ua`` for the requested ``domains``.

        Domains which were not requested, or for which no pattern
        matches, are left as ``None`` in the result.
        """
        user_agent = os_ = device = None
        if Domain.USER_AGENT in domains:
            if matches := self.ua.Match(ua):
                # Set/Filter does not return the match in index order
                # (position order?) so to fit UAP semantics we need to
                # extract the first matching regex (lowest index).
                user_agent = self.user_agent_matchers[min(matches)](ua)
        if Domain.OS in domains:
            if matches := self.os.Match(ua):
                os_ = self.os_matchers[min(matches)](ua)
        if Domain.DEVICE in domains:
            if matches := self.devices.Match(ua):
                device = self.device_matchers[min(matches)](ua)
        return PartialParseResult(
            domains=domains, string=ua, user_agent=user_agent, os=os_, device=device
        )

tests/test_core.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,10 @@
5353
id="lru",
5454
),
5555
]
56+
with contextlib.suppress(ImportError):
57+
from ua_parser import re2
5658

59+
PARSERS.append(pytest.param(re2.Parser(load_builtins()), id="re2"))
5760

5861
UA_FIELDS = {f.name for f in dataclasses.fields(UserAgent)}
5962

tox.ini

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,9 +19,15 @@ wheel_build_env = .pkg
1919
deps =
2020
pytest
2121
pyyaml
22+
google-re2
2223
commands =
2324
pytest -Werror --doctest-glob="*.rst" {posargs}
2425

26+
[testenv:pypy3.{8,9,10},py312]
27+
deps =
28+
pytest
29+
pyyaml
30+
2531
[testenv:flake8]
2632
package = skip
2733
deps = flake8

0 commit comments

Comments
 (0)