@@ -2901,9 +2901,8 @@ def test_softmax_f32_f32(self) -> None:
                 torch.ones((12, 4), dtype=torch.int8),  # weights_hidden: 12x4 (3*4 x 4)
                 0.1,  # w_h_scale
                 torch.zeros(12, dtype=torch.int8),  # bias_inputs: 12
-                0.1,  # b_i_scale
+                0.1,  # b_scale
                 torch.zeros(12, dtype=torch.int8),  # bias_hidden: 12
-                0.1,  # b_h_scale
             ),
             (
                 "invalid_batch_size_2",
@@ -2918,9 +2917,8 @@ def test_softmax_f32_f32(self) -> None:
                 torch.ones((12, 4), dtype=torch.int8),  # weights_hidden: 12x4
                 0.1,  # w_h_scale
                 torch.zeros(12, dtype=torch.int8),  # bias_inputs: 12
-                0.1,  # b_i_scale
+                0.1,  # b_scale
                 torch.zeros(12, dtype=torch.int8),  # bias_hidden: 12
-                0.1,  # b_h_scale
             ),
             (
                 "non_zero_biases",
@@ -2933,11 +2931,10 @@ def test_softmax_f32_f32(self) -> None:
                 torch.tensor(
                     [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], dtype=torch.int8
                 ),  # bias_inputs: 12
-                0.1,  # b_i_scale
+                0.1,  # b_scale
                 torch.tensor(
                     [1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3], dtype=torch.int8
                 ),  # bias_hidden: 12
-                0.1,  # b_h_scale
             ),
             (
                 "negative_weights",
@@ -2954,9 +2951,8 @@ def test_softmax_f32_f32(self) -> None:
                 ),  # weights_hidden: 12x4 (alternating pattern)
                 0.1,  # w_h_scale
                 torch.zeros(12, dtype=torch.int8),  # bias_inputs: 12
-                0.1,  # b_i_scale
+                0.1,  # b_scale
                 torch.zeros(12, dtype=torch.int8),  # bias_hidden: 12
-                0.1,  # b_h_scale
             ),
             (
                 "hidden_dim_8",
@@ -2969,9 +2965,8 @@ def test_softmax_f32_f32(self) -> None:
                 torch.ones((24, 8), dtype=torch.int8),  # weights_hidden: 24x8 (3*8 x 8)
                 0.1,  # w_h_scale
                 torch.zeros(24, dtype=torch.int8),  # bias_inputs: 24
-                0.1,  # b_i_scale
+                0.1,  # b_scale
                 torch.zeros(24, dtype=torch.int8),  # bias_hidden: 24
-                0.1,  # b_h_scale
             ),
         ]
     )
@@ -2985,9 +2980,8 @@ def test_quantized_w8a32_gru(
         weights_hidden: torch.Tensor,
         w_h_scale: float,
         bias_inputs: torch.Tensor,
-        b_i_scale: float,
+        b_scale: float,
         bias_hidden: torch.Tensor,
-        b_h_scale: float,
     ) -> None:
 
         if name == "invalid_batch_size_2":
@@ -3000,9 +2994,8 @@ def test_quantized_w8a32_gru(
                     weights_hidden,
                     w_h_scale,
                     bias_inputs,
-                    b_i_scale,
+                    b_scale,
                     bias_hidden,
-                    b_h_scale,
                 )
             self.assertIn(
                 "Leading dimension 0 of hidden state must be 1", str(context.exception)
@@ -3017,9 +3010,8 @@ def test_quantized_w8a32_gru(
             weights_hidden,
             w_h_scale,
             bias_inputs,
-            b_i_scale,
+            b_scale,
             bias_hidden,
-            b_h_scale,
         )
 
         # Verify output properties
@@ -3028,10 +3020,11 @@ def test_quantized_w8a32_gru(
             torch.float32,
             f"Output dtype should be float32 in {name}",
         )
+        expected_shape = (2, inputs.shape[0], inputs.shape[1], hidden.shape[-1])
         self.assertEqual(
             output.shape,
-            (2, *hidden.shape),
-            f"Output shape should match {(2, *hidden.shape)} in {name}",
+            expected_shape,
+            f"Output shape should match {expected_shape} in {name}",
         )
         assert isinstance(output, torch.Tensor)
 
@@ -3064,7 +3057,6 @@ def test_quantized_w8a32_gru_invalid_hidden_dim(self) -> None:
                 bias_inputs,
                 0.1,
                 bias_hidden,
-                0.1,
             )
 
         self.assertIn(
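Note on the assertion change above: the expected output shape is now derived from both the inputs and the hidden state, (2, inputs.shape[0], inputs.shape[1], hidden.shape[-1]), instead of (2, *hidden.shape). The snippet below is a minimal, standalone sketch of that shape calculation only; the helper name and the example tensor sizes are hypothetical and not part of the patch, and the actual quantized GRU operator call is elided from this diff.

import torch

def expected_gru_output_shape(inputs: torch.Tensor, hidden: torch.Tensor) -> tuple:
    # Hypothetical helper mirroring the updated assertion: a leading 2 (two
    # stacked result tensors), the inputs' first two dimensions, and the
    # hidden state's last dimension.
    return (2, inputs.shape[0], inputs.shape[1], hidden.shape[-1])

# Example sizes are assumptions for illustration only.
inputs = torch.zeros((1, 3, 4), dtype=torch.float32)
hidden = torch.zeros((1, 4), dtype=torch.float32)
print(expected_gru_output_shape(inputs, hidden))  # (2, 1, 3, 4)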