QPR-12225 -- Extend regression test JSON comparison to allow for differently ordered results

damienbarker · jenkins · commit 5262c1f1ad35 · 2024-03-27T17:03:17.000Z
diff --git a/Tools/PythonTools/Readme.md b/Tools/PythonTools/Readme.md
@@ -64,6 +64,16 @@ Each object under a given key, `file_name`, has the following format (note that
             "key2",
             "key3/subkey1"
           ],
+          "keys": {
+            "key1/": [
+              "subkey1",
+              "subkey2"
+            ],
+            "": [
+              "subkey1",
+              "subkey2"
+            ]
+          },
           "settings": [
             {
               "names": [
@@ -100,6 +110,7 @@ For `csv_settings`:
 For `json_settings`:
   - Any key value (i.e. in `ignore_keys`, `settings.names`, etc.) must include the parent, if any. Using the sample comparison_config.json template above, we would ignore "key1" and "key2" at the top level in a JSON file comparison, and "subkey1" only if it appears inside of "key3".
   - The `ignore_keys` is an array of strings, each string being a key in the JSON file. If the key is found in one or both of the files, any diffs will be ignored for this key and its children (i.e. if the value is itself a nested object).
+  - `keys` is a dictionary whose keys are paths and whose values **must be** arrays of key names, which are used as the join columns for a fallback comparison. The fallback uses `datacompy.core.Compare()` (which is already used for CSV comparisons), so that a JSON diff can still pass if the only reason for the diff is that the results are not given in the same order. A blank key ("") means that the whole JSON file is expected to consist of an array of unnested JSON objects (usually a CSV report that was converted directly to a JSON report). Hence, it never makes sense to have a blank ("") key along with non-blank keys in the same file comparison config. **NOTE:** A non-blank key must end in a "/".
   - The `settings` object works the same as the `column_settings` file in `csv_settings`, except that the keys must include the parent in the JSON `settings`.
   - **NOTE:** In order for a JSON check to be applied, a comp config must be provided for the filename, even if the config is empty (see e.g. `all_string_filename` in the template above). Otherwise a direct file comparison will be done.
   - **NOTE:** String diffs are automatically processed (i.e. unless they are in `ignore_keys`, then a string diff will be a failing diff) (see e.g. `all_string_filename` in the template above). Only numerical differences need to be handled in `settings`.
diff --git a/Tools/PythonTools/compare_files.py b/Tools/PythonTools/compare_files.py
@@ -1,4 +1,5 @@
 # Function for comparing two files.
+from pprint import pprint
 import os
 import argparse
 import collections
@@ -414,7 +415,7 @@ def compare_files_df(name, file_1, file_2, config):
                         logger.warning('Failing test, because require_equal_optional_cols is true')
                         return False
                     else:
-                        logger.warning('Ignore unequal optional cols, because because require_equal_optional_cols is false')
+                        logger.warning('Ignore unequal optional cols, because require_equal_optional_cols is false')
                 else:
                     logger.warning('Failing test, because require_equal_optional_cols is not given, defaults to true')
                     return False
@@ -594,7 +595,7 @@ def compare_files_json(name, file_1, file_2, config) -> bool:
     json_diff = jsondiff.diff(json_1, json_2, syntax='symmetric', marshal=True)
 
     # Filter out differences that can be ignored, or are within tolerances
-    validate_json_diff(json_diff, config, '')
+    validate_json_diff(json_1, json_2, json_diff, config, '')
 
     if json_diff:
         if isinstance(json_diff, dict):
@@ -607,7 +608,7 @@ def compare_files_json(name, file_1, file_2, config) -> bool:
         return True
 
 # Modifies jsondif.diff output so that 'ignoreable' diffs and diffs within tolerance/s are removed
-def validate_json_diff(json_diff: dict, config: dict, path: str) -> None:
+def validate_json_diff(json_1, json_2, json_diff: dict, config: dict, path: str) -> None:
     logger = logging.getLogger(__name__)
 
     if not json_diff:
@@ -620,7 +621,7 @@ def validate_json_diff(json_diff: dict, config: dict, path: str) -> None:
     # If the diff obj is a set of diffs between two arrays (denoted by dict with int keys)
     if all([isinstance(k, int) for k in json_diff.keys()]):
         for diff in json_diff.values():
-            validate_json_diff(diff, config, path)
+            validate_json_diff(json_1, json_2, diff, config, path)
 
     # If the diff obj is a dict of diffs between two objects
     else:
@@ -634,18 +635,19 @@ def validate_json_diff(json_diff: dict, config: dict, path: str) -> None:
                 del json_diff[key_to_check]
 
         for key, diff in json_diff.items():
+            # Insertions (key="$insert") and deletions (key="$delete") should count as a test failure
+            if "$" in key:
+                continue
+
             if isinstance(diff, dict):
                 new_path = ((path + str(key)) if path else str(key)) + '/'
-                validate_json_diff(diff, config, new_path)
+                validate_json_diff(json_1, json_2, diff, config, new_path)
 
             else:
-                # Insertions (key="$insert") and deletions (key="$delete") should count as a test failure
-                if "$" in key:
-                    continue
-
                 # At this point, we expect diff to be a list
                 if not isinstance(diff, list):
                     logger.warning(f"diff has invalid type: {type(diff)=}")
+                    logger.warning(f"{diff=}")
                     continue
                 # with 2 elements
                 if not (len(diff) == 2):
@@ -679,6 +681,46 @@ def validate_json_diff(json_diff: dict, config: dict, path: str) -> None:
                                     if abs_check or rel_check:
                                         diff.clear()
 
+                # Fallback comparison for JSON array diffs that are caused only by different ordering.
+                if diff:
+                    keys = config.get("keys")
+                    if keys is not None and path in keys:
+                        # Only compare on columns defined in "settings"
+                        names_to_check = [n.replace(path, '') for n in names if n.startswith(path)]
+
+                        logger.warning(f"jsondiff list comparison failed for {path=}. Falling back to datacompy.core.Compare()")
+                        path_tokens = [tok for tok in path.split('/') if tok]
+
+                        # Get the original JSON objs
+                        json_1_ptr = json_1
+                        json_2_ptr = json_2
+
+                        # Get the list/array within the JSON objs that failed the comparison
+                        if path_tokens:
+                            json_1_ptr = json_1_ptr[0]
+                            json_2_ptr = json_2_ptr[0]
+                            for pt in path_tokens:
+                                json_1_ptr = json_1_ptr[pt]
+                                json_2_ptr = json_2_ptr[pt]
+
+                        # Convert these lists (we are implicitly assuming they are lists) to DataFrame for datacompy comparison, similar to the CSV reports
+                        json_1_df = pd.DataFrame(json_1_ptr)
+                        # json_1_df = json_1_df[[header for header in json_1_df.columns if header in names_to_check + keys[path]]]
+                        json_2_df = pd.DataFrame(json_2_ptr)
+                        # json_2_df = json_2_df[[header for header in json_2_df.columns if header in names_to_check + keys[path]]]
+
+                        # Run comparison
+                        comp = Compare(json_1_df, json_2_df, join_columns=keys[path], abs_tol=abs_tol, rel_tol=rel_tol,
+                               df1_name='expected', df2_name='calculated')
+
+                        if comp.matches():
+                            diff.clear()
+                        else:
+                            print(json_1_df)
+                            print(json_2_df)
+                            logger.warning("Fallback comparison failed.")
+                            logger.warning(comp.report())
+
     # For the diffs that are now empty, we can remove them from the dict
     diffs_to_ignore = []
     for key, diff in json_diff.items():
diff --git a/Tools/PythonTools/comparison_config.json b/Tools/PythonTools/comparison_config.json
@@ -1424,6 +1424,11 @@
         ]
       },
       "curves\\.json": {
+        "keys": {
+          "data/": [
+            "dates"
+          ]
+        },
         "settings": [
           {
             "names": [
@@ -1440,6 +1445,16 @@
         "ignore_keys": [
           "simmResultsPath"
         ],
+        "keys": {
+          "": [
+            "portfolio",
+            "product_class",
+            "risk_class",
+            "margin_type",
+            "bucket",
+            "side"
+          ]
+        },
         "settings": [
           {
             "names": [
@@ -1454,6 +1469,66 @@
         ]
       },
       "im_impact\\.json": {
+        "keys": {
+          "impact/schedule_impact_report/": [
+            "portfolio",
+            "agreement_type",
+            "product_class",
+            "call_type",
+            "initial_margin_type",
+            "legal_entity_id",
+            "side"
+          ],
+          "impact/simm_impact_report/": [
+            "portfolio",
+            "agreement_type",
+            "call_type",
+            "initial_margin_type",
+            "legal_entity_id",
+            "product_class",
+            "risk_class",
+            "margin_type",
+            "bucket",
+            "side"
+          ],
+          "impact/total_impact_report/": [
+            "portfolio",
+            "agreement_type",
+            "call_type",
+            "initial_margin_type",
+            "legal_entity_id",
+            "side"
+          ],
+          "impact/standalone_im_schedule_report/": [
+            "portfolio",
+            "agreement_type",
+            "call_type",
+            "initial_margin_type",
+            "legal_entity_id",
+            "product_class",
+            "side"
+          ],
+          "impact/standalone_simm_report/": [
+            "portfolio",
+            "agreement_type",
+            "call_type",
+            "initial_margin_type",
+            "legal_entity_id",
+            "product_class",
+            "risk_class",
+            "margin_type",
+            "bucket",
+            "side"
+          ],
+          "impact/standalone_total_im_report/": [
+            "portfolio",
+            "agreement_type",
+            "call_type",
+            "initial_margin_type",
+            "legal_entity_id",
+            "side"
+          ]
+        },
         "settings": [
           {
             "names": [
@@ -1465,6 +1540,7 @@
               "impact/simm_impact_report/initial_margin_impact",
               "impact/standalone_im_schedule_report/gross_im",
               "impact/standalone_im_schedule_report/schedule_im",
+              "impact/standalone_im_schedule_report/gross_current_rc",
               "impact/standalone_im_schedule_report/net_current_rc",
               "impact/standalone_im_schedule_report/net_to_gross_ratio",
               "impact/standalone_simm_report/initial_margin",
@@ -1491,6 +1567,14 @@
         ]
       },
       "frtb\\.json": {
+        "keys": {
+          "": [
+            "bucket",
+            "capitalRequirement",
+            "risk_class",
+            "correlationScenario"
+          ]
+        },
         "settings": [
           {
             "names": [
@@ -1502,6 +1586,12 @@
         ]
       },
       "total_im\\.json": {
+        "keys": {
+          "": [
+            "side",
+            "portfolio"
+          ]
+        },
         "settings": [
           {
             "names": [
@@ -1517,9 +1607,98 @@
         "ignore_keys": [
           "simmResultsPath"
         ],
+        "keys": {
+          "additionalResults/": [
+            "tradeId",
+            "resultId",
+            "resultType"
+          ],
+          "bacva/": [
+            "analytic",
+            "counterparty",
+            "nettingSetId"
+          ],
+          "frtb/": [
+            "bucket",
+            "capitalRequirement",
+            "risk_class",
+            "correlationScenario"
+          ],
+          "impact/schedule_impact_report/": [
+            "portfolio",
+            "agreement_type",
+            "product_class",
+            "call_type",
+            "initial_margin_type",
+            "legal_entity_id",
+            "side"
+          ],
+          "impact/simm_impact_report/": [
+            "portfolio",
+            "agreement_type",
+            "call_type",
+            "initial_margin_type",
+            "legal_entity_id",
+            "product_class",
+            "risk_class",
+            "margin_type",
+            "bucket",
+            "side"
+          ],
+          "impact/total_impact_report/": [
+            "portfolio",
+            "agreement_type",
+            "call_type",
+            "initial_margin_type",
+            "legal_entity_id",
+            "side"
+          ],
+          "impact/standalone_im_schedule_report/": [
+            "portfolio",
+            "agreement_type",
+            "call_type",
+            "initial_margin_type",
+            "legal_entity_id",
+            "product_class",
+            "side"
+          ],
+          "impact/standalone_simm_report/": [
+            "portfolio",
+            "agreement_type",
+            "call_type",
+            "initial_margin_type",
+            "legal_entity_id",
+            "product_class",
+            "risk_class",
+            "margin_type",
+            "bucket",
+            "side"
+          ],
+          "impact/standalone_total_im_report/": [
+            "portfolio",
+            "agreement_type",
+            "call_type",
+            "initial_margin_type",
+            "legal_entity_id",
+            "side"
+          ],
+          "simmReport/": [
+            "portfolio",
+            "product_class",
+            "risk_class",
+            "margin_type",
+            "bucket",
+            "side"
+          ],
+          "totalIMReport/": [
+            "side",
+            "portfolio"
+          ]
+        },
         "settings": [
           {
             "names": [
+              "additionalResults/resultValue",
               "bacva/value",
               "cashflow/accrual",
               "cashflow/accruedAmount",