From c4f04ff91ef6bfb048ffc1ad0d6fea252e4ab01c Mon Sep 17 00:00:00 2001
From: perseus <51974392+tcconnally@users.noreply.github.com>
Date: Mon, 15 Jun 2026 21:20:56 +0000
Subject: [PATCH 1/4] fix: ROUGE-1 eval fails for non-English languages
 (ASCII-only tokenizer)

The default RougeScorer tokenizer lowercases the text and strips every
character outside ASCII [a-z0-9]. For non-Latin scripts (Thai, Chinese,
Japanese, etc.), this returns zero tokens, causing ROUGE scores of 0.0
even when the response matches the expected output exactly.

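Add a _unicode_tokenize function that applies a re.UNICODE word pattern
to ASCII-majority text. For all other text it splits on whitespace and
punctuation, falling back to character-level tokenization when that
split yields no tokens.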

Closes #3111
---
 .../adk/evaluation/final_response_match_v1.py | 30 ++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index 24b77da1499..999190095bd 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -92,6 +92,30 @@ def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
 
+def _unicode_tokenize(text: str):
+  """Tokenizes text using Unicode-aware word boundaries.
+
+  The default RougeScorer tokenizer uses r'\\w+' which only matches ASCII
+  [a-zA-Z0-9_]. For non-Latin scripts (Thai, Chinese, Japanese, Arabic, etc.),
+  this returns zero tokens, causing ROUGE scores of 0.0 on matching responses.
+
+  This tokenizer uses re.UNICODE for ASCII-majority text and falls back to
+  character-level tokenization for non-ASCII text.
+  """
+  import re
+  # For primarily non-ASCII text, tokenize by Unicode-aware patterns
+  ascii_chars = sum(1 for c in text if ord(c) < 128)
+  if ascii_chars > len(text) * 0.5:
+    return re.findall(r'[\\w]+', text.lower(), re.UNICODE)
+  # For non-Latin scripts, use whitespace splitting with Unicode support
+  tokens = re.split(r'[\\s\\p{P}]+', text, flags=re.UNICODE)
+  tokens = [t.lower() for t in tokens if t]
+  if tokens:
+    return tokens
+  # Character-level fallback for scripts without word boundaries
+  return list(text.lower())
+
+
 def _calculate_rouge_1_scores(candidate: str, reference: str):
   """Calculates the ROUGE-1 score between a candidate and reference text.
 
@@ -110,7 +134,11 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
   Returns:
       A dictionary containing the ROUGE-1 precision, recall, and f-measure.
   """
-  scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True)
+  scorer = rouge_scorer.RougeScorer(
+        ["rouge1"],
+        use_stemmer=True,
+        tokenizer=_unicode_tokenize,
+    )
 
   # The score method returns a dictionary where keys are the ROUGE types
   # and values are Score objects (tuples) with precision, recall, and fmeasure.

From 5e7d5573c62032b29046188764e9406a92a762a6 Mon Sep 17 00:00:00 2001
From: perseus <51974392+tcconnally@users.noreply.github.com>
Date: Mon, 15 Jun 2026 19:52:27 -0500
Subject: [PATCH 2/4] fix: use proper tokenizer class for Unicode RougeScorer

- Replace function _unicode_tokenize with _UnicodeTokenizer class
  implementing the tokenize() method expected by RougeScorer
- Move import re to module level
- Fix double-escaped regex patterns (\\w -> \w, remove unsupported \p{P})
- Add return type annotation for tokenize() to satisfy mypy strict mode
- Fix RougeScorer constructor indentation
---
 .../adk/evaluation/final_response_match_v1.py | 46 +++++++++----------
 1 file changed, 23 insertions(+), 23 deletions(-)

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index 999190095bd..4c92c0cff84 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -14,6 +14,7 @@
 
 from __future__ import annotations
 
+import re
 from typing import Optional
 
 from google.genai import types as genai_types
@@ -92,28 +93,27 @@ def _get_eval_status(score: float, threshold: float):
   return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED
 
 
-def _unicode_tokenize(text: str):
-  """Tokenizes text using Unicode-aware word boundaries.
+class _UnicodeTokenizer:
+  """Tokenizer that handles Unicode text with word-boundary awareness.
 
-  The default RougeScorer tokenizer uses r'\\w+' which only matches ASCII
-  [a-zA-Z0-9_]. For non-Latin scripts (Thai, Chinese, Japanese, Arabic, etc.),
-  this returns zero tokens, causing ROUGE scores of 0.0 on matching responses.
+  The default RougeScorer tokenizer splits on whitespace, which works for
+  ASCII and Latin-script text but produces zero tokens for text in scripts
+  without word boundaries (Chinese, Japanese, Thai, etc.).
 
-  This tokenizer uses re.UNICODE for ASCII-majority text and falls back to
-  character-level tokenization for non-ASCII text.
+  For ASCII-majority text this tokenizer uses Unicode-aware word-character
+  matching (``\\w+`` in re). For non-ASCII text it falls back to whitespace
+  splitting, then character-level tokenization.
   """
-  import re
-  # For primarily non-ASCII text, tokenize by Unicode-aware patterns
-  ascii_chars = sum(1 for c in text if ord(c) < 128)
-  if ascii_chars > len(text) * 0.5:
-    return re.findall(r'[\\w]+', text.lower(), re.UNICODE)
-  # For non-Latin scripts, use whitespace splitting with Unicode support
-  tokens = re.split(r'[\\s\\p{P}]+', text, flags=re.UNICODE)
-  tokens = [t.lower() for t in tokens if t]
-  if tokens:
-    return tokens
-  # Character-level fallback for scripts without word boundaries
-  return list(text.lower())
+
+  def tokenize(self, text: str) -> list[str]:
+    """Tokenizes text using Unicode-aware word boundaries."""
+    ascii_chars = sum(1 for c in text if ord(c) < 128)
+    if ascii_chars > len(text) * 0.5:
+      return re.findall(r'\w+', text.lower())
+    tokens = text.lower().split()
+    if tokens:
+      return tokens
+    return list(text.lower())
 
 
 def _calculate_rouge_1_scores(candidate: str, reference: str):
@@ -135,10 +135,10 @@ def _calculate_rouge_1_scores(candidate: str, reference: str):
       A dictionary containing the ROUGE-1 precision, recall, and f-measure.
   """
   scorer = rouge_scorer.RougeScorer(
-        ["rouge1"],
-        use_stemmer=True,
-        tokenizer=_unicode_tokenize,
-    )
+      ["rouge1"],
+      use_stemmer=True,
+      tokenizer=_UnicodeTokenizer(),
+  )
 
   # The score method returns a dictionary where keys are the ROUGE types
   # and values are Score objects (tuples) with precision, recall, and fmeasure.

From 98396a4de3e3ae59654aca92c10291ce19d09c77 Mon Sep 17 00:00:00 2001
From: perseus <51974392+tcconnally@users.noreply.github.com>
Date: Wed, 17 Jun 2026 18:40:42 +0000
Subject: [PATCH 3/4] chore: apply pyink formatting

---
 src/google/adk/evaluation/final_response_match_v1.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index 4c92c0cff84..9131529b585 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -109,7 +109,7 @@ def tokenize(self, text: str) -> list[str]:
     """Tokenizes text using Unicode-aware word boundaries."""
     ascii_chars = sum(1 for c in text if ord(c) < 128)
     if ascii_chars > len(text) * 0.5:
-      return re.findall(r'\w+', text.lower())
+      return re.findall(r"\w+", text.lower())
     tokens = text.lower().split()
     if tokens:
       return tokens

From 01686748330e35bcf0221ab0e3c24cab35e1f389 Mon Sep 17 00:00:00 2001
From: Perseus Computing <51974392+tcconnally@users.noreply.github.com>
Date: Fri, 26 Jun 2026 10:44:48 -0500
Subject: [PATCH 4/4] fix: correct _UnicodeTokenizer for CJK segmentation and
 stemming
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The previous tokenizer had two defects:

- Its char-level fallback was unreachable: it split non-ASCII text on
  whitespace first, and scripts without spaces (Chinese, Japanese, Thai)
  yield a single token, so the `list(text)` fallback never ran. Two
  different CJK strings sharing characters scored 0.0 instead of getting
  partial credit.
- Passing a custom `tokenizer=` makes rouge-score ignore `use_stemmer`,
  so English stemming was silently dropped (e.g. "running" no longer
  matched "run").

Now ASCII-majority text is delegated to rouge-score's DefaultTokenizer
(preserving Porter stemming and existing behavior exactly), and non-ASCII
text keeps Latin/digit runs as words while splitting remaining word
characters individually so partial overlap is scored.

Verified: Thai exact=1.0, CJK exact=1.0, CJK partial(你好世界 vs 你好朋友)=0.5,
English stemming(running fast vs run fast)=1.0, Latin sanity matches default.
---
 .../adk/evaluation/final_response_match_v1.py | 29 +++++++++++--------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py
index 9131529b585..cb113e11bb3 100644
--- a/src/google/adk/evaluation/final_response_match_v1.py
+++ b/src/google/adk/evaluation/final_response_match_v1.py
@@ -96,24 +96,29 @@ def _get_eval_status(score: float, threshold: float):
 class _UnicodeTokenizer:
   """Tokenizer that handles Unicode text with word-boundary awareness.
 
-  The default RougeScorer tokenizer splits on whitespace, which works for
-  ASCII and Latin-script text but produces zero tokens for text in scripts
-  without word boundaries (Chinese, Japanese, Thai, etc.).
-
-  For ASCII-majority text this tokenizer uses Unicode-aware word-character
-  matching (``\\w+`` in re). For non-ASCII text it falls back to whitespace
-  splitting, then character-level tokenization.
+  The default RougeScorer tokenizer strips characters outside ``[a-z0-9]``, so
+  text in scripts without Latin word boundaries (Chinese, Japanese, Thai, etc.)
+  produces zero tokens and scores 0.0 even on an exact match.
+
+  ASCII-majority text is delegated to rouge-score's ``DefaultTokenizer`` so the
+  existing behavior -- including Porter stemming -- is preserved exactly. For
+  non-ASCII text, Latin/digit runs are kept as words and each remaining word
+  character (e.g. a CJK ideograph) becomes its own token, so partial overlap is
+  scored instead of collapsing into a single opaque token.
   """
 
+  def __init__(self, use_stemmer: bool = True):
+    self._default = rouge_scorer.tokenizers.DefaultTokenizer(use_stemmer)
+
   def tokenize(self, text: str) -> list[str]:
     """Tokenizes text using Unicode-aware word boundaries."""
+    text = text.lower()
+    if not text:
+      return []
     ascii_chars = sum(1 for c in text if ord(c) < 128)
     if ascii_chars > len(text) * 0.5:
-      return re.findall(r"\w+", text.lower())
-    tokens = text.lower().split()
-    if tokens:
-      return tokens
-    return list(text.lower())
+      return self._default.tokenize(text)
+    return re.findall(r"[a-z0-9]+|\w", text, re.UNICODE)
 
 
 def _calculate_rouge_1_scores(candidate: str, reference: str):