From c4f04ff91ef6bfb048ffc1ad0d6fea252e4ab01c Mon Sep 17 00:00:00 2001 From: perseus <51974392+tcconnally@users.noreply.github.com> Date: Mon, 15 Jun 2026 21:20:56 +0000 Subject: [PATCH 1/4] fix: ROUGE-1 eval fails for non-English languages (ASCII-only tokenizer) The default RougeScorer tokenizer uses r'\\w+' regex which only matches ASCII [a-zA-Z0-9_]. For non-Latin scripts (Thai, Chinese, Japanese, etc.), this returns zero tokens, causing ROUGE scores of 0.0 even when the response matches the expected output exactly. Added _unicode_tokenize function that uses re.UNICODE flag and falls back to character-level tokenization for non-ASCII scripts. Closes #3111 --- .../adk/evaluation/final_response_match_v1.py | 30 ++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py index 24b77da1499..999190095bd 100644 --- a/src/google/adk/evaluation/final_response_match_v1.py +++ b/src/google/adk/evaluation/final_response_match_v1.py @@ -92,6 +92,30 @@ def _get_eval_status(score: float, threshold: float): return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED +def _unicode_tokenize(text: str): + """Tokenizes text using Unicode-aware word boundaries. + + The default RougeScorer tokenizer uses r'\\w+' which only matches ASCII + [a-zA-Z0-9_]. For non-Latin scripts (Thai, Chinese, Japanese, Arabic, etc.), + this returns zero tokens, causing ROUGE scores of 0.0 on matching responses. + + This tokenizer uses re.UNICODE for ASCII-majority text and falls back to + character-level tokenization for non-ASCII text. 
+ """ + import re + # For primarily non-ASCII text, tokenize by Unicode-aware patterns + ascii_chars = sum(1 for c in text if ord(c) < 128) + if ascii_chars > len(text) * 0.5: + return re.findall(r'[\\w]+', text.lower(), re.UNICODE) + # For non-Latin scripts, use whitespace splitting with Unicode support + tokens = re.split(r'[\\s\\p{P}]+', text, flags=re.UNICODE) + tokens = [t.lower() for t in tokens if t] + if tokens: + return tokens + # Character-level fallback for scripts without word boundaries + return list(text.lower()) + + def _calculate_rouge_1_scores(candidate: str, reference: str): """Calculates the ROUGE-1 score between a candidate and reference text. @@ -110,7 +134,11 @@ def _calculate_rouge_1_scores(candidate: str, reference: str): Returns: A dictionary containing the ROUGE-1 precision, recall, and f-measure. """ - scorer = rouge_scorer.RougeScorer(["rouge1"], use_stemmer=True) + scorer = rouge_scorer.RougeScorer( + ["rouge1"], + use_stemmer=True, + tokenizer=_unicode_tokenize, + ) # The score method returns a dictionary where keys are the ROUGE types # and values are Score objects (tuples) with precision, recall, and fmeasure. 
From 5e7d5573c62032b29046188764e9406a92a762a6 Mon Sep 17 00:00:00 2001 From: perseus <51974392+tcconnally@users.noreply.github.com> Date: Mon, 15 Jun 2026 19:52:27 -0500 Subject: [PATCH 2/4] fix: use proper tokenizer class for Unicode RougeScorer - Replace function _unicode_tokenize with _UnicodeTokenizer class implementing the tokenize() method expected by RougeScorer - Move import re to module level - Fix double-escaped regex patterns (\\w -> \w, remove unsupported \p{P}) - Add return type annotation for tokenize() to satisfy mypy strict mode - Fix RougeScorer constructor indentation --- .../adk/evaluation/final_response_match_v1.py | 46 +++++++++---------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py index 999190095bd..4c92c0cff84 100644 --- a/src/google/adk/evaluation/final_response_match_v1.py +++ b/src/google/adk/evaluation/final_response_match_v1.py @@ -14,6 +14,7 @@ from __future__ import annotations +import re from typing import Optional from google.genai import types as genai_types @@ -92,28 +93,27 @@ def _get_eval_status(score: float, threshold: float): return EvalStatus.PASSED if score >= threshold else EvalStatus.FAILED -def _unicode_tokenize(text: str): - """Tokenizes text using Unicode-aware word boundaries. +class _UnicodeTokenizer: + """Tokenizer that handles Unicode text with word-boundary awareness. - The default RougeScorer tokenizer uses r'\\w+' which only matches ASCII - [a-zA-Z0-9_]. For non-Latin scripts (Thai, Chinese, Japanese, Arabic, etc.), - this returns zero tokens, causing ROUGE scores of 0.0 on matching responses. + The default RougeScorer tokenizer splits on whitespace, which works for + ASCII and Latin-script text but produces zero tokens for text in scripts + without word boundaries (Chinese, Japanese, Thai, etc.). 
- This tokenizer uses re.UNICODE for ASCII-majority text and falls back to - character-level tokenization for non-ASCII text. + For ASCII-majority text this tokenizer uses Unicode-aware word-character + matching (``\\w+`` in re). For non-ASCII text it falls back to whitespace + splitting, then character-level tokenization. """ - import re - # For primarily non-ASCII text, tokenize by Unicode-aware patterns - ascii_chars = sum(1 for c in text if ord(c) < 128) - if ascii_chars > len(text) * 0.5: - return re.findall(r'[\\w]+', text.lower(), re.UNICODE) - # For non-Latin scripts, use whitespace splitting with Unicode support - tokens = re.split(r'[\\s\\p{P}]+', text, flags=re.UNICODE) - tokens = [t.lower() for t in tokens if t] - if tokens: - return tokens - # Character-level fallback for scripts without word boundaries - return list(text.lower()) + + def tokenize(self, text: str) -> list[str]: + """Tokenizes text using Unicode-aware word boundaries.""" + ascii_chars = sum(1 for c in text if ord(c) < 128) + if ascii_chars > len(text) * 0.5: + return re.findall(r'\w+', text.lower()) + tokens = text.lower().split() + if tokens: + return tokens + return list(text.lower()) def _calculate_rouge_1_scores(candidate: str, reference: str): @@ -135,10 +135,10 @@ def _calculate_rouge_1_scores(candidate: str, reference: str): A dictionary containing the ROUGE-1 precision, recall, and f-measure. """ scorer = rouge_scorer.RougeScorer( - ["rouge1"], - use_stemmer=True, - tokenizer=_unicode_tokenize, - ) + ["rouge1"], + use_stemmer=True, + tokenizer=_UnicodeTokenizer(), + ) # The score method returns a dictionary where keys are the ROUGE types # and values are Score objects (tuples) with precision, recall, and fmeasure. 
From 98396a4de3e3ae59654aca92c10291ce19d09c77 Mon Sep 17 00:00:00 2001 From: perseus <51974392+tcconnally@users.noreply.github.com> Date: Wed, 17 Jun 2026 18:40:42 +0000 Subject: [PATCH 3/4] chore: apply pyink formatting --- src/google/adk/evaluation/final_response_match_v1.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py index 4c92c0cff84..9131529b585 100644 --- a/src/google/adk/evaluation/final_response_match_v1.py +++ b/src/google/adk/evaluation/final_response_match_v1.py @@ -109,7 +109,7 @@ def tokenize(self, text: str) -> list[str]: """Tokenizes text using Unicode-aware word boundaries.""" ascii_chars = sum(1 for c in text if ord(c) < 128) if ascii_chars > len(text) * 0.5: - return re.findall(r'\w+', text.lower()) + return re.findall(r"\w+", text.lower()) tokens = text.lower().split() if tokens: return tokens From 01686748330e35bcf0221ab0e3c24cab35e1f389 Mon Sep 17 00:00:00 2001 From: Perseus Computing <51974392+tcconnally@users.noreply.github.com> Date: Fri, 26 Jun 2026 10:44:48 -0500 Subject: [PATCH 4/4] fix: correct _UnicodeTokenizer for CJK segmentation and stemming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The previous tokenizer had two defects: - Its char-level fallback was unreachable: it split non-ASCII text on whitespace first, and scripts without spaces (Chinese, Japanese, Thai) yield a single token, so the `list(text)` fallback never ran. Two different CJK strings sharing characters scored 0.0 instead of getting partial credit. - Passing a custom `tokenizer=` makes rouge-score ignore `use_stemmer`, so English stemming was silently dropped (e.g. "running" no longer matched "run"). 
Now ASCII-majority text is delegated to rouge-score's DefaultTokenizer (preserving Porter stemming and existing behavior exactly), and non-ASCII text keeps Latin/digit runs as words while splitting remaining word characters individually so partial overlap is scored. Verified: Thai exact=1.0, CJK exact=1.0, CJK partial(你好世界 vs 你好朋友)=0.5, English stemming(running fast vs run fast)=1.0, Latin sanity matches default. --- .../adk/evaluation/final_response_match_v1.py | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/google/adk/evaluation/final_response_match_v1.py b/src/google/adk/evaluation/final_response_match_v1.py index 9131529b585..cb113e11bb3 100644 --- a/src/google/adk/evaluation/final_response_match_v1.py +++ b/src/google/adk/evaluation/final_response_match_v1.py @@ -96,24 +96,29 @@ def _get_eval_status(score: float, threshold: float): class _UnicodeTokenizer: """Tokenizer that handles Unicode text with word-boundary awareness. - The default RougeScorer tokenizer splits on whitespace, which works for - ASCII and Latin-script text but produces zero tokens for text in scripts - without word boundaries (Chinese, Japanese, Thai, etc.). - - For ASCII-majority text this tokenizer uses Unicode-aware word-character - matching (``\\w+`` in re). For non-ASCII text it falls back to whitespace - splitting, then character-level tokenization. + The default RougeScorer tokenizer strips characters outside ``[a-z0-9]``, so + text in scripts without Latin word boundaries (Chinese, Japanese, Thai, etc.) + produces zero tokens and scores 0.0 even on an exact match. + + ASCII-majority text is delegated to rouge-score's ``DefaultTokenizer`` so the + existing behavior -- including Porter stemming -- is preserved exactly. For + non-ASCII text, Latin/digit runs are kept as words and each remaining word + character (e.g. a CJK ideograph) becomes its own token, so partial overlap is + scored instead of collapsing into a single opaque token. 
""" + def __init__(self, use_stemmer: bool = True): + self._default = rouge_scorer.tokenizers.DefaultTokenizer(use_stemmer) + def tokenize(self, text: str) -> list[str]: """Tokenizes text using Unicode-aware word boundaries.""" + text = text.lower() + if not text: + return [] ascii_chars = sum(1 for c in text if ord(c) < 128) if ascii_chars > len(text) * 0.5: - return re.findall(r"\w+", text.lower()) - tokens = text.lower().split() - if tokens: - return tokens - return list(text.lower()) + return self._default.tokenize(text) + return re.findall(r"[a-z0-9]+|\w", text, re.UNICODE) def _calculate_rouge_1_scores(candidate: str, reference: str):