Skip to content

Commit ec13e1b

Browse files
committed
Add wildcard ContentSets to avoid performance problems
1 parent e877929 commit ec13e1b

7 files changed

Lines changed: 163 additions & 106 deletions

File tree

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPrivate.qll

Lines changed: 58 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -753,7 +753,7 @@ predicate jumpStepNotSharedWithTypeTracker(Node nodeFrom, Node nodeTo) {
753753
* As of 2024-04-02 the type-tracking library only supports precise content, so there is
754754
* no reason to include steps for list content right now.
755755
*/
756-
predicate storeStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
756+
predicate storeStepCommon(Node nodeFrom, Content c, Node nodeTo) {
757757
tupleStoreStep(nodeFrom, c, nodeTo)
758758
or
759759
dictStoreStep(nodeFrom, c, nodeTo)
@@ -767,29 +767,31 @@ predicate storeStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
767767
* Holds if data can flow from `nodeFrom` to `nodeTo` via an assignment to
768768
* content `c`.
769769
*/
770-
predicate storeStep(Node nodeFrom, ContentSet c, Node nodeTo) {
771-
storeStepCommon(nodeFrom, c, nodeTo)
772-
or
773-
listStoreStep(nodeFrom, c, nodeTo)
774-
or
775-
setStoreStep(nodeFrom, c, nodeTo)
776-
or
777-
attributeStoreStep(nodeFrom, c, nodeTo)
778-
or
779-
matchStoreStep(nodeFrom, c, nodeTo)
780-
or
781-
any(Orm::AdditionalOrmSteps es).storeStep(nodeFrom, c, nodeTo)
770+
predicate storeStep(Node nodeFrom, ContentSet cs, Node nodeTo) {
771+
exists(Content c | cs = singleton(c) |
772+
storeStepCommon(nodeFrom, c, nodeTo)
773+
or
774+
listStoreStep(nodeFrom, c, nodeTo)
775+
or
776+
setStoreStep(nodeFrom, c, nodeTo)
777+
or
778+
attributeStoreStep(nodeFrom, c, nodeTo)
779+
or
780+
matchStoreStep(nodeFrom, c, nodeTo)
781+
or
782+
any(Orm::AdditionalOrmSteps es).storeStep(nodeFrom, c, nodeTo)
783+
or
784+
synthStarArgsElementParameterNodeStoreStep(nodeFrom, c, nodeTo)
785+
or
786+
synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo)
787+
or
788+
yieldStoreStep(nodeFrom, c, nodeTo)
789+
or
790+
VariableCapture::storeStep(nodeFrom, c, nodeTo)
791+
)
782792
or
783-
FlowSummaryImpl::Private::Steps::summaryStoreStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), c,
793+
FlowSummaryImpl::Private::Steps::summaryStoreStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), cs,
784794
nodeTo.(FlowSummaryNode).getSummaryNode())
785-
or
786-
synthStarArgsElementParameterNodeStoreStep(nodeFrom, c, nodeTo)
787-
or
788-
synthDictSplatArgumentNodeStoreStep(nodeFrom, c, nodeTo)
789-
or
790-
yieldStoreStep(nodeFrom, c, nodeTo)
791-
or
792-
VariableCapture::storeStep(nodeFrom, c, nodeTo)
793795
}
794796

795797
/**
@@ -985,7 +987,7 @@ predicate attributeStoreStep(Node nodeFrom, AttributeContent c, Node nodeTo) {
985987
/**
986988
* Subset of `readStep` that should be shared with type-tracking.
987989
*/
988-
predicate readStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
990+
predicate readStepCommon(Node nodeFrom, Content c, Node nodeTo) {
989991
subscriptReadStep(nodeFrom, c, nodeTo)
990992
or
991993
iterableUnpackingReadStep(nodeFrom, c, nodeTo)
@@ -994,23 +996,25 @@ predicate readStepCommon(Node nodeFrom, ContentSet c, Node nodeTo) {
994996
/**
995997
* Holds if data can flow from `nodeFrom` to `nodeTo` via a read of content `c`.
996998
*/
997-
predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) {
998-
readStepCommon(nodeFrom, c, nodeTo)
999-
or
1000-
matchReadStep(nodeFrom, c, nodeTo)
1001-
or
1002-
forReadStep(nodeFrom, c, nodeTo)
1003-
or
1004-
attributeReadStep(nodeFrom, c, nodeTo)
999+
predicate readStep(Node nodeFrom, ContentSet cs, Node nodeTo) {
1000+
exists(Content c | cs = singleton(c) |
1001+
readStepCommon(nodeFrom, c, nodeTo)
1002+
or
1003+
matchReadStep(nodeFrom, c, nodeTo)
1004+
or
1005+
forReadStep(nodeFrom, c, nodeTo)
1006+
or
1007+
attributeReadStep(nodeFrom, c, nodeTo)
1008+
or
1009+
synthDictSplatParameterNodeReadStep(nodeFrom, c, nodeTo)
1010+
or
1011+
VariableCapture::readStep(nodeFrom, c, nodeTo)
1012+
)
10051013
or
1006-
FlowSummaryImpl::Private::Steps::summaryReadStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), c,
1014+
FlowSummaryImpl::Private::Steps::summaryReadStep(nodeFrom.(FlowSummaryNode).getSummaryNode(), cs,
10071015
nodeTo.(FlowSummaryNode).getSummaryNode())
10081016
or
1009-
synthDictSplatParameterNodeReadStep(nodeFrom, c, nodeTo)
1010-
or
1011-
VariableCapture::readStep(nodeFrom, c, nodeTo)
1012-
or
1013-
Conversions::readStep(nodeFrom, c, nodeTo)
1017+
Conversions::readStep(nodeFrom, cs, nodeTo)
10141018
}
10151019

10161020
/** Data flows from a sequence to a subscript of the sequence. */
@@ -1074,23 +1078,15 @@ module Conversions {
10741078
nodeFrom = decoding.getAnInput() and
10751079
nodeTo = decoding.getOutput()
10761080
) and
1077-
(
1078-
c instanceof TupleElementContent
1079-
or
1080-
c instanceof DictionaryElementContent
1081-
)
1081+
(c.isAnyTupleElement() or c.isAnyDictionaryElement())
10821082
}
10831083

10841084
predicate encoderReadStep(Node nodeFrom, ContentSet c, Node nodeTo) {
10851085
exists(Encoding encoding |
10861086
nodeFrom = encoding.getAnInput() and
10871087
nodeTo = encoding.getOutput()
10881088
) and
1089-
(
1090-
c instanceof TupleElementContent
1091-
or
1092-
c instanceof DictionaryElementContent
1093-
)
1089+
(c.isAnyTupleElement() or c.isAnyDictionaryElement())
10941090
}
10951091

10961092
predicate formatReadStep(Node nodeFrom, ContentSet c, Node nodeTo) {
@@ -1099,13 +1095,13 @@ module Conversions {
10991095
fmt.getOp() instanceof Mod and
11001096
fmt.getRight() = nodeFrom.asCfgNode()
11011097
) and
1102-
c instanceof TupleElementContent
1098+
c.isAnyTupleElement()
11031099
or
11041100
// format_map
11051101
// see https://docs.python.org/3/library/stdtypes.html#str.format_map
11061102
nodeTo.(MethodCallNode).calls(_, "format_map") and
11071103
nodeTo.(MethodCallNode).getArg(0) = nodeFrom and
1108-
c instanceof DictionaryElementContent
1104+
c.isAnyDictionaryElement()
11091105
}
11101106

11111107
predicate readStep(Node nodeFrom, ContentSet c, Node nodeTo) {
@@ -1122,18 +1118,20 @@ module Conversions {
11221118
* any value stored inside `f` is cleared at the pre-update node associated with `x`
11231119
* in `x.f = newValue`.
11241120
*/
1125-
predicate clearsContent(Node n, ContentSet c) {
1126-
matchClearStep(n, c)
1127-
or
1128-
attributeClearStep(n, c)
1129-
or
1130-
dictClearStep(n, c)
1131-
or
1132-
FlowSummaryImpl::Private::Steps::summaryClearsContent(n.(FlowSummaryNode).getSummaryNode(), c)
1133-
or
1134-
dictSplatParameterNodeClearStep(n, c)
1121+
predicate clearsContent(Node n, ContentSet cs) {
1122+
exists(Content c | cs = singleton(c) |
1123+
matchClearStep(n, c)
1124+
or
1125+
attributeClearStep(n, c)
1126+
or
1127+
dictClearStep(n, c)
1128+
or
1129+
dictSplatParameterNodeClearStep(n, c)
1130+
or
1131+
VariableCapture::clearsContent(n, c)
1132+
)
11351133
or
1136-
VariableCapture::clearsContent(n, c)
1134+
FlowSummaryImpl::Private::Steps::summaryClearsContent(n.(FlowSummaryNode).getSummaryNode(), cs)
11371135
}
11381136

11391137
/**

python/ql/lib/semmle/python/dataflow/new/internal/DataFlowPublic.qll

Lines changed: 50 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -898,19 +898,65 @@ class CapturedVariableContent extends Content, TCapturedVariableContent {
898898
override string getMaDRepresentation() { none() }
899899
}
900900

901+
/**
902+
* An entity that represents a set of `Content`s.
903+
*
904+
* Most `ContentSet`s are singletons (i.e. they consist of a single `Content`),
905+
* but `AnyDictionaryElement` and `AnyTupleElement` act as wildcards on the
906+
* read side: a read at such a `ContentSet` matches any specific dictionary
907+
* key / tuple index store, as well as (for dictionaries) the
908+
* "unknown-bucket" Content `DictionaryElementAnyContent`.
909+
*
910+
* Keeping these as wildcard `ContentSet`s (rather than enumerating one
911+
* `ContentSet` per key/index) keeps the dataflow `readSetEx` relation small
912+
* when implicit reads are used (e.g. at sinks via `defaultImplicitTaintRead`).
913+
*/
914+
private newtype TContentSet =
915+
TSingletonContent(Content c) or
916+
TAnyTupleElement() or
917+
TAnyDictionaryElement()
918+
901919
/**
902920
* An entity that represents a set of `Content`s.
903921
*
904922
* The set may be interpreted differently depending on whether it is
905923
* stored into (`getAStoreContent`) or read from (`getAReadContent`).
906924
*/
907-
class ContentSet instanceof Content {
925+
class ContentSet extends TContentSet {
926+
/** Holds if this content set is the singleton `{c}`. */
927+
predicate isSingleton(Content c) { this = TSingletonContent(c) }
928+
929+
/** Holds if this content set is the wildcard for all tuple elements. */
930+
predicate isAnyTupleElement() { this = TAnyTupleElement() }
931+
932+
/** Holds if this content set is the wildcard for all dictionary elements. */
933+
predicate isAnyDictionaryElement() { this = TAnyDictionaryElement() }
934+
908935
/** Gets a content that may be stored into when storing into this set. */
909-
Content getAStoreContent() { result = this }
936+
Content getAStoreContent() { this = TSingletonContent(result) }
910937

911938
/** Gets a content that may be read from when reading from this set. */
912-
Content getAReadContent() { result = this }
939+
Content getAReadContent() {
940+
this = TSingletonContent(result)
941+
or
942+
// Wildcard expansion: a read at "any tuple element" matches a store at any
943+
// specific tuple index. (Stores always target a specific index, so we don't
944+
// need a `TupleElementAnyContent` Content kind here.)
945+
this = TAnyTupleElement() and result instanceof TupleElementContent
946+
or
947+
this = TAnyDictionaryElement() and
948+
(result instanceof DictionaryElementContent or result instanceof DictionaryElementAnyContent)
949+
}
913950

914951
/** Gets a textual representation of this content set. */
915-
string toString() { result = super.toString() }
952+
string toString() {
953+
exists(Content c | this = TSingletonContent(c) | result = c.toString())
954+
or
955+
this = TAnyTupleElement() and result = "Any tuple element"
956+
or
957+
this = TAnyDictionaryElement() and result = "Any dictionary element"
958+
}
916959
}
960+
961+
/** Gets the singleton `ContentSet` wrapping the `Content` `c`. */
962+
ContentSet singleton(Content c) { result = TSingletonContent(c) }

python/ql/lib/semmle/python/dataflow/new/internal/FlowSummaryImpl.qll

Lines changed: 26 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -66,21 +66,27 @@ module Input implements InputSig<Location, DataFlowImplSpecific::PythonDataFlow>
6666
}
6767

6868
string encodeContent(ContentSet cs, string arg) {
69-
cs = TListElementContent() and result = "ListElement" and arg = ""
70-
or
71-
cs = TSetElementContent() and result = "SetElement" and arg = ""
72-
or
73-
exists(int index |
74-
cs = TTupleElementContent(index) and result = "TupleElement" and arg = index.toString()
69+
exists(Content c | cs.isSingleton(c) |
70+
c = TListElementContent() and result = "ListElement" and arg = ""
71+
or
72+
c = TSetElementContent() and result = "SetElement" and arg = ""
73+
or
74+
exists(int index |
75+
c = TTupleElementContent(index) and result = "TupleElement" and arg = index.toString()
76+
)
77+
or
78+
exists(string key |
79+
c = TDictionaryElementContent(key) and result = "DictionaryElement" and arg = key
80+
)
81+
or
82+
c = TDictionaryElementAnyContent() and result = "DictionaryElementAny" and arg = ""
83+
or
84+
exists(string attr | c = TAttributeContent(attr) and result = "Attribute" and arg = attr)
7585
)
7686
or
77-
exists(string key |
78-
cs = TDictionaryElementContent(key) and result = "DictionaryElement" and arg = key
79-
)
87+
cs.isAnyTupleElement() and result = "AnyTupleElement" and arg = ""
8088
or
81-
cs = TDictionaryElementAnyContent() and result = "DictionaryElementAny" and arg = ""
82-
or
83-
exists(string attr | cs = TAttributeContent(attr) and result = "Attribute" and arg = attr)
89+
cs.isAnyDictionaryElement() and result = "AnyDictionaryElement" and arg = ""
8490
}
8591

8692
bindingset[token]
@@ -139,27 +145,29 @@ module Private {
139145
predicate withContent = SC::withContent/1;
140146

141147
/** Gets a summary component that represents a list element. */
142-
SummaryComponent listElement() { result = content(any(ListElementContent c)) }
148+
SummaryComponent listElement() { result = content(singleton(any(ListElementContent c))) }
143149

144150
/** Gets a summary component that represents a set element. */
145-
SummaryComponent setElement() { result = content(any(SetElementContent c)) }
151+
SummaryComponent setElement() { result = content(singleton(any(SetElementContent c))) }
146152

147153
/** Gets a summary component that represents a tuple element. */
148154
SummaryComponent tupleElement(int index) {
149-
exists(TupleElementContent c | c.getIndex() = index and result = content(c))
155+
exists(TupleElementContent c | c.getIndex() = index and result = content(singleton(c)))
150156
}
151157

152158
/** Gets a summary component that represents a dictionary element. */
153159
SummaryComponent dictionaryElement(string key) {
154-
exists(DictionaryElementContent c | c.getKey() = key and result = content(c))
160+
exists(DictionaryElementContent c | c.getKey() = key and result = content(singleton(c)))
155161
}
156162

157163
/** Gets a summary component that represents a dictionary element at any key. */
158-
SummaryComponent dictionaryElementAny() { result = content(any(DictionaryElementAnyContent c)) }
164+
SummaryComponent dictionaryElementAny() {
165+
result = content(singleton(any(DictionaryElementAnyContent c)))
166+
}
159167

160168
/** Gets a summary component that represents an attribute element. */
161169
SummaryComponent attribute(string attr) {
162-
exists(AttributeContent c | c.getAttribute() = attr and result = content(c))
170+
exists(AttributeContent c | c.getAttribute() = attr and result = content(singleton(c)))
163171
}
164172

165173
/** Gets a summary component that represents the return value of a call. */

python/ql/lib/semmle/python/dataflow/new/internal/TaintTrackingPrivate.qll

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,14 +17,15 @@ predicate defaultTaintSanitizer(DataFlow::Node node) { none() }
1717
*/
1818
bindingset[node]
1919
predicate defaultImplicitTaintRead(DataFlow::Node node, DataFlow::ContentSet c) {
20-
// We allow implicit reads of precise content
21-
// imprecise content has already bubled up.
20+
// We allow implicit reads of precise content; imprecise content has already
21+
// bubbled up. We use the wildcard content sets here rather than the
22+
// per-key/per-index ones to avoid blowing up the size of `Stage1::readSetEx`
23+
// (otherwise this predicate would expand to one row per (node, distinct key
24+
// or index) and the framework's read-set relation grows quadratically).
25+
// `ContentSet.getAReadContent` expands these wildcards back to the specific
26+
// contents when matching against stores.
2227
exists(node) and
23-
(
24-
c instanceof DataFlow::TupleElementContent
25-
or
26-
c instanceof DataFlow::DictionaryElementContent
27-
)
28+
(c.isAnyTupleElement() or c.isAnyDictionaryElement())
2829
}
2930

3031
private module Cached {

python/ql/lib/semmle/python/dataflow/new/internal/TypeTrackingImpl.qll

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -241,7 +241,7 @@ module TypeTrackingInput implements Shared::TypeTrackingInput<Location> {
241241
// is only fed set/list content)
242242
not nodeFrom instanceof DataFlowPublic::IterableElementNode
243243
or
244-
TypeTrackerSummaryFlow::basicStoreStep(nodeFrom, nodeTo, content)
244+
TypeTrackerSummaryFlow::basicStoreStep(nodeFrom, nodeTo, DataFlowPublic::singleton(content))
245245
}
246246

247247
/**
@@ -272,14 +272,15 @@ module TypeTrackingInput implements Shared::TypeTrackingInput<Location> {
272272
nodeFrom.asCfgNode() instanceof SequenceNode
273273
)
274274
or
275-
TypeTrackerSummaryFlow::basicLoadStep(nodeFrom, nodeTo, content)
275+
TypeTrackerSummaryFlow::basicLoadStep(nodeFrom, nodeTo, DataFlowPublic::singleton(content))
276276
}
277277

278278
/**
279279
* Holds if the `loadContent` of `nodeFrom` is stored in the `storeContent` of `nodeTo`.
280280
*/
281281
predicate loadStoreStep(Node nodeFrom, Node nodeTo, Content loadContent, Content storeContent) {
282-
TypeTrackerSummaryFlow::basicLoadStoreStep(nodeFrom, nodeTo, loadContent, storeContent)
282+
TypeTrackerSummaryFlow::basicLoadStoreStep(nodeFrom, nodeTo,
283+
DataFlowPublic::singleton(loadContent), DataFlowPublic::singleton(storeContent))
283284
}
284285

285286
/**

python/ql/src/Variables/LoopVariableCapture/LoopVariableCaptureQuery.qll

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,13 @@ module EscapingCaptureFlowConfig implements DataFlow::ConfigSig {
6161
predicate allowImplicitRead(DataFlow::Node node, DataFlow::ContentSet cs) {
6262
isSink(node) and
6363
(
64-
cs.(DataFlow::TupleElementContent).getIndex() in [0 .. 10] or
65-
cs instanceof DataFlow::ListElementContent or
66-
cs instanceof DataFlow::SetElementContent or
67-
cs instanceof DataFlow::DictionaryElementAnyContent
64+
cs.isAnyTupleElement()
65+
or
66+
cs.isAnyDictionaryElement()
67+
or
68+
cs.getAStoreContent() instanceof DataFlow::ListElementContent
69+
or
70+
cs.getAStoreContent() instanceof DataFlow::SetElementContent
6871
)
6972
}
7073
}

0 commit comments

Comments
 (0)