Skip to content

Commit ed0d59c

Browse files
committed
fix: global pipeline context passing
1 parent 5cceb7c commit ed0d59c

2 files changed

Lines changed: 101 additions & 53 deletions

File tree

laygo/pipeline.py

Lines changed: 61 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,10 @@
1+
# pipeline.py
2+
13
from collections.abc import Callable
24
from collections.abc import Iterable
35
from collections.abc import Iterator
46
import itertools
7+
import multiprocessing as mp
58
from typing import Any
69
from typing import TypeVar
710
from typing import overload
@@ -17,34 +20,79 @@
1720
class Pipeline[T]:
1821
"""
1922
Manages a data source and applies transformers to it.
20-
Provides terminal operations to consume the resulting data.
23+
Always uses a multiprocessing-safe shared context.
2124
"""
2225

2326
def __init__(self, *data: Iterable[T]):
  """Build a pipeline over one or more data sources.

  Multiple sources are chained lazily into a single stream. A
  multiprocessing manager is created up front so the context is
  always safe to share with worker processes.
  """
  if not data:
    raise ValueError("At least one data source must be provided to Pipeline.")
  # Single source: use it directly; several: chain them lazily.
  self.data_source: Iterable[T] = data[0] if len(data) == 1 else itertools.chain.from_iterable(data)
  self.processed_data: Iterator = iter(self.data_source)

  # Manager-backed dict so every worker process sees one shared context.
  self._manager = mp.Manager()
  self.ctx = self._manager.dict()
  # Shared lock lets concurrent transformers update the context safely.
  self.ctx["lock"] = self._manager.Lock()

  # The user's original context, refreshed from self.ctx at the end.
  self._original_context_ref: PipelineContext | None = None
40+
41+
def __del__(self):
  """Best-effort cleanup of the multiprocessing manager on destruction."""
  try:
    # Push the final shared state back to the user's context first,
    # then tear down the manager process.
    self._sync_context_back()
    self._manager.shutdown()
  except Exception:
    # The interpreter may already be shutting down; nothing actionable.
    pass
2948

3049
def context(self, ctx: PipelineContext) -> "Pipeline[T]":
  """Merge *ctx* into the pipeline's shared context.

  A reference to *ctx* is also kept so that, once processing
  finishes, the final pipeline context data can be written back
  into the caller's original context object.
  """
  self._original_context_ref = ctx
  self.ctx.update(ctx)
  return self
3660

61+
def _sync_context_back(self) -> None:
  """Write the final shared-context state back into the user's context.

  Called after processing completes; a no-op when no original
  context was registered via ``context()``.
  """
  if self._original_context_ref is None:
    return
  # Snapshot the proxy dict, dropping the manager lock — it is not
  # meaningful (or serializable) outside this pipeline.
  snapshot = {k: v for k, v in dict(self.ctx).items() if k != "lock"}
  self._original_context_ref.clear()
  self._original_context_ref.update(snapshot)
72+
73+
def transform[U](self, t: Callable[[Transformer[T, T]], Transformer[T, U]]) -> "Pipeline[U]":
74+
"""
75+
Shorthand method to apply a transformation using a lambda function.
76+
Creates a Transformer under the hood and applies it to the pipeline.
77+
78+
Args:
79+
t: A callable that takes a transformer and returns a transformed transformer
80+
81+
Returns:
82+
A new Pipeline with the transformed data
83+
"""
84+
# Create a new transformer and apply the transformation function
85+
transformer = t(Transformer[T, T]())
86+
return self.apply(transformer)
87+
3788
@overload
3889
def apply[U](self, transformer: Transformer[T, U]) -> "Pipeline[U]": ...
3990

4091
@overload
4192
def apply[U](self, transformer: Callable[[Iterable[T]], Iterator[U]]) -> "Pipeline[U]": ...
4293

4394
@overload
44-
def apply[U](
45-
self,
46-
transformer: Callable[[Iterable[T], PipelineContext], Iterator[U]],
47-
) -> "Pipeline[U]": ...
95+
def apply[U](self, transformer: Callable[[Iterable[T], PipelineContext], Iterator[U]]) -> "Pipeline[U]": ...
4896

4997
def apply[U](
5098
self,
@@ -53,42 +101,26 @@ def apply[U](
53101
| Callable[[Iterable[T], PipelineContext], Iterator[U]],
54102
) -> "Pipeline[U]":
55103
"""
56-
Applies a transformer to the current data source.
104+
Applies a transformer to the current data source. The pipeline's
105+
managed context is passed down.
57106
"""
58-
59107
match transformer:
60108
case Transformer():
61-
# If a Transformer instance is provided, use its __call__ method
109+
# The transformer is called with self.ctx, which is the
110+
# shared mp.Manager.dict proxy when inside a 'with' block.
62111
self.processed_data = transformer(self.processed_data, self.ctx) # type: ignore
63112
case _ if callable(transformer):
64-
# If a callable function is provided, call it with the current data and context
65-
66113
if is_context_aware(transformer):
67114
processed_transformer = transformer
68115
else:
69116
processed_transformer = lambda data, ctx: transformer(data) # type: ignore # noqa: E731
70-
71117
self.processed_data = processed_transformer(self.processed_data, self.ctx) # type: ignore
72118
case _:
73119
raise TypeError("Transformer must be a Transformer instance or a callable function")
74120

75121
return self # type: ignore
76122

77-
def transform[U](self, t: Callable[[Transformer[T, T]], Transformer[T, U]]) -> "Pipeline[U]":
78-
"""
79-
Shorthand method to apply a transformation using a lambda function.
80-
Creates a Transformer under the hood and applies it to the pipeline.
81-
82-
Args:
83-
t: A callable that takes a transformer and returns a transformed transformer
84-
85-
Returns:
86-
A new Pipeline with the transformed data
87-
"""
88-
# Create a new transformer and apply the transformation function
89-
transformer = t(Transformer[T, T]())
90-
return self.apply(transformer)
91-
123+
# ... The rest of the Pipeline class (transform, __iter__, to_list, etc.) remains unchanged ...
92124
def __iter__(self) -> Iterator[T]:
  """Allows the pipeline to be iterated over."""
  # Lazily forward each processed item to the consumer.
  for item in self.processed_data:
    yield item

laygo/transformers/parallel.py

Lines changed: 40 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111
import copy
1212
import itertools
1313
import multiprocessing as mp
14+
from multiprocessing.managers import DictProxy
1415
from typing import Any
1516
from typing import Union
1617
from typing import overload
@@ -85,30 +86,45 @@ def from_transformer[T, U](
8586
)
8687

8788
def __call__(self, data: Iterable[In], context: PipelineContext | None = None) -> Iterator[Out]:
  """
  Executes the transformer on data concurrently. It uses the shared
  context provided by the Pipeline, if available.

  Args:
    data: Input items to process in parallel chunks.
    context: Optional pipeline context; falls back to self.context.

  Yields:
    Transformed items, chunk by chunk.
  """
  run_context = context if context is not None else self.context

  # A DictProxy means the context is already managed by the Pipeline.
  if isinstance(run_context, DictProxy):
    # Use the existing shared context and lock from the Pipeline.
    # It is live in the workers, so no copy-back is needed here;
    # the Pipeline handles final state synchronization.
    yield from self._execute_with_context(data, run_context)
  else:
    # Fallback for standalone use: create a temporary manager.
    with mp.Manager() as manager:
      shared_context = manager.dict(dict(run_context))
      if "lock" not in shared_context:
        shared_context["lock"] = manager.Lock()

      try:
        yield from self._execute_with_context(data, shared_context)
      finally:
        # FIX: run the copy-back in a finally so the caller's context
        # is updated even if the consumer stops iterating early and
        # this generator is closed before the data is exhausted.
        final_context_state = dict(shared_context)
        final_context_state.pop("lock", None)  # manager locks are not portable
        run_context.update(final_context_state)
118+
119+
def _execute_with_context(self, data: Iterable[In], shared_context: MutableMapping[str, Any]) -> Iterator[Out]:
  """Helper to run the execution logic with a given context."""
  with ProcessPoolExecutor(max_workers=self.max_workers) as executor:
    # Pick the chunk generator matching the requested ordering.
    generate = self._unordered_generator if not self.ordered else self._ordered_generator
    for chunk in generate(self._chunk_generator(data), executor, shared_context):
      yield from chunk
112128

113129
# ... The rest of the file remains the same ...
114130
def _ordered_generator(

0 commit comments

Comments
 (0)