Skip to content

Commit 16c1324

Browse files
committed
Add support for lazy matchers
Add lazy builtin matchers (with a separately compiled file), as well as loading JSON or YAML files using lazy matchers. Lazy matchers are very much a tradeoff: they improve import speed (and memory consumption until triggered), but slow down run speed, possibly dramatically: - importing the package itself takes ~36ms - importing the lazy matchers takes ~36ms (including the package, so ~0) and ~70kB RSS - importing the eager matchers takes ~97ms and ~780kB RSS - triggering the instantiation of the lazy matchers adds ~800kB RSS - running bench on the sample file using the lazy matchers has 700~800ms overhead compared to the eager matchers. While the lazy matchers are less costly across the board until they're used, benching the sample file causes the loading of *every* regex -- likely due to matching failures -- which has a 700~800ms overhead over eager matchers, and increases the RSS by ~800kB (on top of the original ~70kB). Thus lazy matchers are not a great default for the basic parser, though they might be a good opt-in if the user only ever uses one of the domains (especially if it's not the devices one, as that's by far the largest). With the re2 parser however, only 156 of the 1162 regexes get evaluated, leading to a minor CPU overhead of 20~30ms (1% of bench time) and a more reasonable memory overhead. Thus use the lazy matchers for the re2 parser. On the more net-negative but relatively minor side of things, the pregenerated lazy matchers file adds ~120kB to the on-disk requirements of the library, and ~25kB to the wheel archive. The existing _regexes and _matchers precompiled files have the same kind of cost. pyc files seem to be even bigger (~130kB), so the tradeoff is dubious even if they are slightly faster. Fixes #171, fixes #173
1 parent 04d0b7d commit 16c1324

10 files changed

Lines changed: 452 additions & 138 deletions

File tree

setup.py

Lines changed: 139 additions & 100 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
#!/usr/bin/env python
22
# flake8: noqa
33
import io
4-
from contextlib import suppress
4+
from contextlib import suppress, contextmanager
55
from os import fspath
66
from pathlib import Path
77
from typing import Optional, List, Dict
@@ -52,21 +52,6 @@ def run(self) -> None:
5252
f"Unable to find regexes.yaml, should be at {yaml_src!r}"
5353
)
5454

55-
def write_matcher(f, typ: str, fields: List[Optional[object]]):
56-
f.write(f" {typ}(".encode())
57-
while len(fields) > 1 and fields[-1] is None:
58-
fields = fields[:-1]
59-
f.write(", ".join(map(repr, fields)).encode())
60-
f.write(b"),\n")
61-
62-
def write_params(fields):
63-
# strip trailing None values
64-
while len(fields) > 1 and fields[-1] is None:
65-
fields.pop()
66-
67-
for field in fields:
68-
fp.write((f" {field!r},\n").encode())
69-
7055
with yaml_src.open("rb") as f:
7156
regexes = yaml.safe_load(f)
7257

@@ -79,96 +64,150 @@ def write_params(fields):
7964
outdir.mkdir(parents=True, exist_ok=True)
8065

8166
dest = outdir / "_matchers.py"
67+
dest_lazy = outdir / "_lazy.py"
8268
dest_legacy = outdir / "_regexes.py"
8369

84-
with dest.open("wb") as f, dest_legacy.open("wb") as fp:
85-
# fmt: off
86-
f.write(b"""\
70+
with dest.open("wb") as eager, dest_lazy.open("wb") as lazy, dest_legacy.open(
71+
"wb"
72+
) as legacy:
73+
eager = EagerWriter(eager)
74+
lazy = LazyWriter(lazy)
75+
legacy = LegacyWriter(legacy)
76+
77+
for section in ["user_agent_parsers", "os_parsers", "device_parsers"]:
78+
with eager.section(section), lazy.section(section), legacy.section(
79+
section
80+
):
81+
extract = EXTRACTORS[section]
82+
for p in regexes[section]:
83+
el = trim(extract(p))
84+
eager.item(el)
85+
lazy.item(el)
86+
legacy.item(el)
87+
eager.end()
88+
lazy.end()
89+
legacy.end()
90+
91+
92+
def trim(l):
93+
while len(l) > 1 and l[-1] is None:
94+
l.pop()
95+
return l
96+
97+
98+
EXTRACTORS = {
99+
"user_agent_parsers": lambda p: [
100+
p["regex"],
101+
p.get("family_replacement"),
102+
p.get("v1_replacement"),
103+
p.get("v2_replacement"),
104+
],
105+
"os_parsers": lambda p: [
106+
p["regex"],
107+
p.get("os_replacement"),
108+
p.get("os_v1_replacement"),
109+
p.get("os_v2_replacement"),
110+
p.get("os_v3_replacement"),
111+
p.get("os_v4_replacement"),
112+
],
113+
"device_parsers": lambda p: [
114+
p["regex"],
115+
p.get("regex_flag"),
116+
p.get("device_replacement"),
117+
p.get("brand_replacement"),
118+
p.get("model_replacement"),
119+
],
120+
}
121+
122+
123+
class Writer:
124+
section_end = b""
125+
126+
def __init__(self, fp):
127+
self.fp = fp
128+
self.fp.write(
129+
b"""\
87130
########################################################
88131
# NOTICE: this file is autogenerated from regexes.yaml #
89132
########################################################
133+
"""
134+
)
135+
self.fp.write(self.prefix)
136+
self._section = None
137+
138+
@contextmanager
139+
def section(self, id):
140+
self._section = id
141+
self.fp.write(self.sections[id])
142+
yield
143+
self.fp.write(self.section_end)
144+
145+
def item(self, elements):
146+
# DeviceMatcher(re, flag, repl1),
147+
self.fp.write(self.items[self._section])
148+
self.fp.write(", ".join(map(repr, elements)).encode())
149+
self.fp.write(b"),\n")
150+
151+
def end(self):
152+
self.fp.write(self.suffix)
153+
154+
155+
class LegacyWriter(Writer):
156+
prefix = b"""\
157+
__all__ = [
158+
"USER_AGENT_PARSERS",
159+
"DEVICE_PARSERS",
160+
"OS_PARSERS",
161+
]
162+
163+
from .user_agent_parser import UserAgentParser, DeviceParser, OSParser
164+
165+
"""
166+
sections = {
167+
"user_agent_parsers": b"USER_AGENT_PARSERS = [\n",
168+
"os_parsers": b"\n\nOS_PARSERS = [\n",
169+
"device_parsers": b"\n\nDEVICE_PARSERS = [\n",
170+
}
171+
section_end = b"]"
172+
items = {
173+
"user_agent_parsers": b" UserAgentParser(",
174+
"os_parsers": b" OSParser(",
175+
"device_parsers": b" DeviceParser(",
176+
}
177+
suffix = b"\n"
178+
179+
180+
class EagerWriter(Writer):
181+
prefix = b"""\
182+
__all__ = ["MATCHERS"]
183+
184+
from typing import Tuple, List
185+
from .core import UserAgentMatcher, OSMatcher, DeviceMatcher
186+
187+
MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
188+
"""
189+
sections = {
190+
"user_agent_parsers": b"",
191+
"os_parsers": b"], [\n",
192+
"device_parsers": b"], [\n",
193+
}
194+
items = {
195+
"user_agent_parsers": b" UserAgentMatcher(",
196+
"os_parsers": b" OSMatcher(",
197+
"device_parsers": b" DeviceMatcher(",
198+
}
199+
suffix = b"])\n"
200+
201+
202+
class LazyWriter(EagerWriter):
203+
prefix = b"""\
204+
__all__ = ["MATCHERS"]
205+
206+
from typing import Tuple, List
207+
from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher
90208
91-
from .core import Matchers, UserAgentMatcher, OSMatcher, DeviceMatcher
92-
93-
MATCHERS: Matchers = ([
94-
""")
95-
fp.write(b"# -*- coding: utf-8 -*-\n")
96-
fp.write(b"########################################################\n")
97-
fp.write(b"# NOTICE: This file is autogenerated from regexes.yaml #\n")
98-
fp.write(b"########################################################\n")
99-
fp.write(b"\n")
100-
fp.write(b"from .user_agent_parser import (\n")
101-
fp.write(b" UserAgentParser, DeviceParser, OSParser,\n")
102-
fp.write(b")\n")
103-
fp.write(b"\n")
104-
fp.write(b"__all__ = ('USER_AGENT_PARSERS', 'DEVICE_PARSERS', 'OS_PARSERS')\n")
105-
fp.write(b"\n")
106-
fp.write(b"USER_AGENT_PARSERS = [\n")
107-
for device_parser in regexes["user_agent_parsers"]:
108-
write_matcher(f, "UserAgentMatcher", [
109-
device_parser["regex"],
110-
device_parser.get("family_replacement"),
111-
device_parser.get("v1_replacement"),
112-
device_parser.get("v2_replacement"),
113-
])
114-
115-
fp.write(b" UserAgentParser(\n")
116-
write_params([
117-
device_parser["regex"],
118-
device_parser.get("family_replacement"),
119-
device_parser.get("v1_replacement"),
120-
device_parser.get("v2_replacement"),
121-
])
122-
fp.write(b" ),\n")
123-
f.write(b" ], [\n")
124-
fp.write(b"]\n\n")
125-
126-
fp.write(b"OS_PARSERS = [\n")
127-
for device_parser in regexes["os_parsers"]:
128-
write_matcher(f, "OSMatcher", [
129-
device_parser["regex"],
130-
device_parser.get("os_replacement"),
131-
device_parser.get("os_v1_replacement"),
132-
device_parser.get("os_v2_replacement"),
133-
device_parser.get("os_v3_replacement"),
134-
device_parser.get("os_v4_replacement"),
135-
])
136-
137-
fp.write(b" OSParser(\n")
138-
write_params([
139-
device_parser["regex"],
140-
device_parser.get("os_replacement"),
141-
device_parser.get("os_v1_replacement"),
142-
device_parser.get("os_v2_replacement"),
143-
device_parser.get("os_v3_replacement"),
144-
device_parser.get("os_v4_replacement"),
145-
])
146-
fp.write(b" ),\n")
147-
f.write(b" ], [\n")
148-
fp.write(b"]\n\n")
149-
150-
fp.write(b"DEVICE_PARSERS = [\n")
151-
for device_parser in regexes["device_parsers"]:
152-
write_matcher(f, "DeviceMatcher", [
153-
device_parser["regex"],
154-
device_parser.get("regex_flag"),
155-
device_parser.get("device_replacement"),
156-
device_parser.get("brand_replacement"),
157-
device_parser.get("model_replacement"),
158-
])
159-
160-
fp.write(b" DeviceParser(\n")
161-
write_params([
162-
device_parser["regex"],
163-
device_parser.get("regex_flag"),
164-
device_parser.get("device_replacement"),
165-
device_parser.get("brand_replacement"),
166-
device_parser.get("model_replacement"),
167-
])
168-
fp.write(b" ),\n")
169-
f.write(b"])\n")
170-
fp.write(b"]\n")
171-
# fmt: on
209+
MATCHERS: Tuple[List[UserAgentMatcher], List[OSMatcher], List[DeviceMatcher]] = ([
210+
"""
172211

173212

174213
setup(

src/ua_parser/__init__.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
"UserAgent",
3737
"UserAgentMatcher",
3838
"load_builtins",
39+
"load_lazy_builtins",
3940
"load_data",
4041
"load_yaml",
4142
"parse",
@@ -65,7 +66,7 @@
6566
)
6667
from .basic import Parser as BasicParser
6768
from .caching import CachingParser, Clearing, LRU, Locking
68-
from .loaders import load_builtins, load_data, load_yaml
69+
from .loaders import load_builtins, load_lazy_builtins, load_data, load_yaml
6970

7071
Re2Parser: Optional[Callable[[Matchers], Parser]] = None
7172
with contextlib.suppress(ImportError):
@@ -79,7 +80,7 @@ def __getattr__(name: str) -> Parser:
7980
global parser
8081
if name == "parser":
8182
if Re2Parser is not None:
82-
parser = Re2Parser(load_builtins())
83+
parser = Re2Parser(load_lazy_builtins())
8384
else:
8485
parser = CachingParser(
8586
BasicParser(load_builtins()),

src/ua_parser/_lazy.pyi

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
__all__ = ["MATCHERS"]
2+
3+
from typing import Tuple, List
4+
from .lazy import UserAgentMatcher, OSMatcher, DeviceMatcher
5+
6+
MATCHERS: Tuple[
7+
List[UserAgentMatcher],
8+
List[OSMatcher],
9+
List[DeviceMatcher],
10+
]

src/ua_parser/_matchers.pyi

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,10 @@
1-
from .core import Matchers
1+
__all__ = ["MATCHERS"]
22

3-
MATCHERS: Matchers
3+
from typing import Tuple, List
4+
from .core import UserAgentMatcher, OSMatcher, DeviceMatcher
5+
6+
MATCHERS: Tuple[
7+
List[UserAgentMatcher],
8+
List[OSMatcher],
9+
List[DeviceMatcher],
10+
]

src/ua_parser/basic.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@
77
Device,
88
DeviceMatcher,
99
Domain,
10+
Matcher,
1011
Matchers,
1112
OS,
1213
OSMatcher,
@@ -23,9 +24,9 @@ class Parser(AbstractParser):
2324
when one matches.
2425
"""
2526

26-
user_agent_matchers: List[UserAgentMatcher]
27-
os_matchers: List[OSMatcher]
28-
device_matchers: List[DeviceMatcher]
27+
user_agent_matchers: List[Matcher[UserAgent]]
28+
os_matchers: List[Matcher[OS]]
29+
device_matchers: List[Matcher[Device]]
2930

3031
def __init__(
3132
self,

0 commit comments

Comments
 (0)