Bugfix in single-line-logic trimming to match jinja output

jmoraleda · jmoraleda · commit f5ec0f940bd2 · 2026-04-11T00:34:38.000-04:00
diff --git a/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java b/src/main/java/com/hubspot/jinjava/tree/parse/TokenScanner.java
@@ -269,11 +269,8 @@ private Token scanPlainText(char c) {
       return handleLineStatement();
     }
     // ── Line comment prefix (e.g. "%# this is ignored") ───────────────────
-    if (
-      lineCommentPrefix != null &&
-      isStartOfLine(currPost) &&
-      regionMatches(currPost, lineCommentPrefix)
-    ) {
+    // Line comments match anywhere on a line, not just at the start.
+    if (lineCommentPrefix != null && regionMatches(currPost, lineCommentPrefix)) {
       return handleLineComment();
     }
     // ── Variable opener e.g. "{{" or "\VAR{" ──────────────────────────────
@@ -370,6 +367,34 @@ private Token handleLineStatement() {
       currLine++;
       lastNewlinePos = next;
     }
+
+    // When lstrip_blocks is active, Python Jinja2 also consumes any blank lines
+    // that follow a line statement (lines containing only horizontal whitespace).
+    // This prevents blank lines between consecutive line statements from
+    // appearing in the output.
+    if (config.isLstripBlocks()) {
+      while (next < length) {
+        // Scan forward past any horizontal whitespace on this line.
+        int lineEnd = next;
+        while (
+          lineEnd < length &&
+          is[lineEnd] != '\n' &&
+          (is[lineEnd] == ' ' || is[lineEnd] == '\t')
+        ) {
+          lineEnd++;
+        }
+        // If we hit a newline (blank or whitespace-only line), consume it.
+        if (lineEnd < length && is[lineEnd] == '\n') {
+          next = lineEnd + 1;
+          currLine++;
+          lastNewlinePos = next;
+        } else {
+          // Hit real content or end of input — stop consuming.
+          break;
+        }
+      }
+    }
+
     tokenStart = next;
     currPost = next;
 
@@ -391,47 +416,56 @@ private Token handleLineStatement() {
   /**
    * Handles a line comment prefix.
    *
-   * <p>Matches Python Jinja2 semantics exactly:
+   * <p>Line comments match anywhere on a line (not just at the start).
+   * For mid-line comments, everything from the prefix to end of line is
+   * stripped; the text before the prefix is kept, minus any trailing spaces/tabs.
+   *
+   * <p>Confirmed Python Jinja2 semantics:
    * <ul>
-   *   <li><b>Plain {@code %#}</b>: the comment content is stripped but the line's
-   *       trailing {@code \n} is <em>kept</em>. The comment line is effectively
-   *       replaced by a blank line in the output.</li>
-   *   <li><b>{@code %#-} (trim modifier)</b>: the comment content AND its trailing
-   *       {@code \n} are both stripped, leaving no blank line.</li>
+   *   <li><b>Plain {@code %#}</b>: comment content stripped, own trailing
+   *       {@code \n} kept. Replaces the comment (and anything after it on
+   *       the line) with a blank line / line ending.</li>
+   *   <li><b>{@code %#-} at start of line</b>: also strips preceding blank
+   *       lines and the {@code \n} ending the last real-content line.</li>
+   *   <li><b>{@code %#-} mid-line</b>: behaves like plain {@code %#} — the
+   *       {@code -} has nothing to left-trim when real content precedes it.</li>
    * </ul>
-   *
-   * <p>Neither form affects the newline that ended the <em>preceding</em> line.
    */
   private Token handleLineComment() {
+    boolean startOfLine = isStartOfLine(currPost);
     int afterPrefix = currPost + lineCommentPrefix.length;
     boolean hasTrimModifier =
       afterPrefix < length && is[afterPrefix] == symbols.getTrimChar();
 
-    // Flush buffered text up to (but not including) the current line's indentation.
-    // The preceding newline is always preserved regardless of the trim modifier.
-    Token pending = flushTextBefore(lineIndentStart(currPost));
+    int flushUpTo;
+    if (!startOfLine) {
+      // Mid-line comment: flush up to the %# prefix, stripping trailing
+      // horizontal whitespace before it (Python strips spaces/tabs before
+      // mid-line comments, e.g. "hello %# comment" → "hello").
+      int p = currPost - 1;
+      while (p >= tokenStart && (is[p] == ' ' || is[p] == '\t')) {
+        p--;
+      }
+      flushUpTo = p + 1;
+    } else if (hasTrimModifier) {
+      // Start-of-line %#-: strip preceding blank lines and the real-content \n.
+      flushUpTo = lineIndentStartSkippingBlanks(currPost);
+    } else {
+      // Start-of-line %#: strip only the current line's indentation.
+      flushUpTo = lineIndentStart(currPost);
+    }
+
+    Token pending = flushTextBefore(flushUpTo);
 
     // Advance past the comment content to the end of the line.
     int end = afterPrefix;
     while (end < length && is[end] != '\n') {
       end++;
     }
 
-    if (hasTrimModifier) {
-      // %#- : strip trailing \n too, leaving no blank line.
-      int next = end;
-      if (next < length && is[next] == '\n') {
-        next++;
-        currLine++;
-        lastNewlinePos = next;
-      }
-      tokenStart = next;
-      currPost = next;
-    } else {
-      // %# : leave the trailing \n in place so it renders as a blank line.
-      tokenStart = end;
-      currPost = end;
-    }
+    // Both %# and %#- keep the trailing \n — it appears in the output.
+    tokenStart = end;
+    currPost = end;
 
     return (pending != null) ? pending : DELIMITER_MATCHED;
   }
@@ -451,6 +485,46 @@ private int lineIndentStart(int pos) {
     return p + 1;
   }
 
+  /**
+   * Returns the flush boundary for a {@code %#-} line comment.
+   *
+   * <p>Python Jinja2 semantics for {@code %#-}: strip back through any preceding
+   * blank lines AND the {@code \n} that ends the last real-content line, so that
+   * the comment's own kept {@code \n} becomes the sole separator. Stops at
+   * {@code tokenStart} so that {@code \n}s produced by preceding line statements
+   * or plain {@code %#} comments are not consumed.
+   *
+   * <p>Examples (| marks the flush boundary):
+   * <pre>
+   *   "A\n\n%#-"   →  flush "A|"      → output "A" + comment's \n
+   *   "%% set\n%#-" → flush nothing    → output comment's \n  (tokenStart guard)
+   * </pre>
+   */
+  private int lineIndentStartSkippingBlanks(int pos) {
+    int p = pos - 1;
+    while (p >= tokenStart) {
+      // Skip trailing horizontal whitespace on this line (going backwards).
+      while (p >= tokenStart && (is[p] == ' ' || is[p] == '\t')) {
+        p--;
+      }
+      if (p < tokenStart) {
+        break;
+      }
+      if (is[p] == '\n') {
+        // Blank line — consume this \n and keep scanning backwards.
+        p--;
+      } else {
+        // Real content at position p. The \n ending this line is at p+1.
+        // Return p+1 so flushTextBefore(p+1) flushes up to but NOT including
+        // that \n, stripping it from the output.
+        return p + 1;
+      }
+    }
+    // Reached tokenStart without finding real content — all blank lines were
+    // preceded by a line statement or plain comment. Preserve them.
+    return tokenStart;
+  }
+
   // ── One-slot stash for the synthetic tag after a line-statement ─────────
   // When a line-statement prefix is found and there is pending text to flush
   // first, we return the text token immediately and stash the synthetic tag
diff --git a/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java b/src/test/java/com/hubspot/jinjava/tree/parse/StringTokenScannerSymbolsTest.java
@@ -2,15 +2,16 @@
 
 import static org.assertj.core.api.Assertions.assertThat;
 import static org.assertj.core.api.Assertions.assertThatThrownBy;
-import java.util.HashMap;
-import org.junit.Before;
-import org.junit.Test;
+
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Lists;
 import com.hubspot.jinjava.BaseJinjavaTest;
 import com.hubspot.jinjava.Jinjava;
 import com.hubspot.jinjava.JinjavaConfig;
 import com.hubspot.jinjava.lib.filter.JoinFilterTest.User;
+import java.util.HashMap;
+import org.junit.Before;
+import org.junit.Test;
 
 public class StringTokenScannerSymbolsTest {
 
@@ -398,6 +399,20 @@ public void itRendersLineStatementMixedWithBlockDelimiters() {
 
   // ── Line comment prefix ────────────────────────────────────────────────────
   //
+  // Ground truth confirmed by running both Python Jinja2 and Jinjava against:
+  //   [START]
+  //   %% set x = 1
+  //   [A]
+  //   %# plain comment
+  //   [B]
+  //   %#- trim comment
+  //   [C]
+  //   %% set y = 2
+  //   [D]
+  //   [END]
+  //
+  // Python output: [START]\n[A]\n\n[B]\n[C]\n[D]\n[END]
+  //
   // Semantics:
   //   %#  (plain): comment content stripped, trailing \n KEPT  → blank line where comment was
   //   %#- (trim):  comment content AND trailing \n stripped     → no blank line
@@ -408,7 +423,7 @@ public void itStripsLineCommentPrefixLeavingBlankLine() {
     Jinjava j = jinjavaWith(
       StringTokenScannerSymbols.builder().withLineCommentPrefix("%#").build()
     );
-    // %# keeps its trailing \n → "before\n" + "\n" + "after" = "before\n\nafter"
+    // %# keeps its trailing \n → "before\n" + "\n" (comment's own \n) + "after"
     String template = "before\n%# this whole line is a comment\nafter";
     assertThat(j.render(template, new HashMap<>())).isEqualTo("before\n\nafter");
   }
@@ -418,7 +433,7 @@ public void itStripsLineCommentWithLeadingWhitespace() {
     Jinjava j = jinjavaWith(
       StringTokenScannerSymbols.builder().withLineCommentPrefix("%#").build()
     );
-    // Indentation before %# is stripped, trailing \n is kept → still a blank line
+    // Indentation before %# is stripped, trailing \n is kept → blank line
     String template = "before\n  %# indented comment\nafter";
     assertThat(j.render(template, new HashMap<>())).isEqualTo("before\n\nafter");
   }
@@ -428,18 +443,22 @@ public void itStripsLineCommentWithTrimModifier() {
     Jinjava j = jinjavaWith(
       StringTokenScannerSymbols.builder().withLineCommentPrefix("%#").build()
     );
-    // %#  keeps trailing \n  → blank line:  "before\n\nafter"
+    // %#  keeps trailing \n (blank line left in output)
     assertThat(j.render("before\n%# comment\nafter", new HashMap<>()))
       .isEqualTo("before\n\nafter");
-    // %#- strips trailing \n → no blank line: "before\nafter"
+    // %#- also keeps trailing \n — the '-' is LEFT-trim only (strips preceding blanks)
+    // With no preceding blank lines it still strips the \n ending "before", unlike plain %#
     assertThat(j.render("before\n%#- comment\nafter", new HashMap<>()))
       .isEqualTo("before\nafter");
+    // %#- with a preceding blank line: strips the blank, keeps own trailing \n
+    assertThat(j.render("before\n\n%#- comment\nafter", new HashMap<>()))
+      .isEqualTo("before\nafter");
   }
 
   @Test
   public void itStripsLineCommentWithoutLeavingBlankLine() {
-    // %#- strips both content and trailing \n → no blank line.
-    // "\\begin{document}\n" (preceding \n kept) + "\\section*{...}" (directly)
+    // %#- with real content before (no blank): strips the preceding \n,
+    // keeps comment's own \n. "\\begin{document}" + "\n" (comment's \n) + "\\section*{...}"
     Jinjava j = new Jinjava(
       BaseJinjavaTest
         .newConfigBuilder()