Install all the dependencies for the evaluation script by running the following command:
```bash
pip install -r requirements-dev.txt
```
## Generate ground truth data
Generate ground truth data by running the following command:
```bash
python evals/generate.py
```
Review the generated data after running that script, removing any question/answer pairs that don't seem like realistic user input.
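If you prefer to do that review with a quick script rather than in an editor, the following is a minimal sketch of an interactive pass over the generated data. It assumes the generator writes JSON Lines with `question` and `truth` fields to `evals/ground_truth.jsonl`; those names are illustrative, so adjust the path and field names to match what `evals/generate.py` actually produces.

```python
# Interactive review pass over the generated ground truth data.
# NOTE: the file path and the "question"/"truth" field names are assumptions;
# adjust them to whatever evals/generate.py actually writes.
import json
from pathlib import Path

ground_truth_path = Path("evals/ground_truth.jsonl")
pairs = [json.loads(line) for line in ground_truth_path.read_text().splitlines() if line.strip()]

kept = []
for pair in pairs:
    print(f"\nQ: {pair['question']}\nA: {pair['truth']}")
    if input("Keep this pair? [y/N] ").strip().lower() == "y":
        kept.append(pair)

# Write back only the pairs that look like realistic user input.
ground_truth_path.write_text("\n".join(json.dumps(pair) for pair in kept) + "\n")
```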
## Evaluate the RAG answer quality
Review the configuration in `evals/eval_config.json` to ensure that everything is correctly set up. You may want to adjust the metrics used. [TODO: link to evaluator docs]
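As a quick sanity check before starting a run, you can confirm the config file parses and see exactly which settings will be used. The snippet below is just a sketch: the keys inside the file depend on the evaluator, so it only echoes whatever is there.

```python
# Print the evaluation config so you can confirm the settings before a run.
# The available keys depend on the evaluator; this just echoes the file.
import json

with open("evals/eval_config.json") as f:
    config = json.load(f)

print(json.dumps(config, indent=2))
```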
By default, the evaluation script will evaluate every question in the ground truth data.
Run the evaluation script with the following command:
```bash
python evals/evaluate.py
```
## Review the evaluation results
The evaluation script will output a summary of the evaluation results inside the `evals/results` directory.
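Each run gets its own subdirectory under `evals/results`. If you want to pull the numbers into your own tooling rather than use the CLI commands below, here is a minimal sketch; it assumes each run directory contains a `summary.json` file, so check the actual files written by `evals/evaluate.py` and adjust the filename accordingly.

```python
# Print the metric summary for every run under evals/results.
# NOTE: the summary.json filename is an assumption; check what
# evals/evaluate.py actually writes into each run directory.
import json
from pathlib import Path

for run_dir in sorted(p for p in Path("evals/results").iterdir() if p.is_dir()):
    summary_file = run_dir / "summary.json"
    if summary_file.exists():
        print(f"=== {run_dir.name} ===")
        print(json.dumps(json.loads(summary_file.read_text()), indent=2))
```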
You can see a summary of results across all evaluation runs by running the following command:
```bash
python -m evaltools summary evals/results
```
Compare answers across runs by running the following command:
```bash
python -m evaltools diff evals/results/baseline/
```
## Run the evaluation in GitHub Actions
- TODO: Add GPT-4 deployment with high capacity for evaluation
- TODO: Add CI workflow that can be triggered to run the evaluation on the local app