From 0ff28af2dee787f6bd9a40cb8ac3a1bd8a41a4e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E9=81=93=E8=BE=95?= <daoyuanchen.cdy@alibaba-inc.com>
Date: Fri, 13 Feb 2026 14:59:32 +0800
Subject: [PATCH] feat: Add combined_logical_filter operator with AND/OR
 support

- Add CombinedLogicalFilter operator for explicit filter composition
- Support AND/OR logical operations between multiple filters
- Enable OR operations that were impossible with sequential filters
- Optimize performance: single batch stats computation, no intermediate data reduction
- Extend FusedFilter to support logical_op parameter (AND/OR)
- Add comprehensive tests covering AND, OR, edge cases, and error handling
- Add detailed documentation with performance comparisons and use cases

Key advantages:
1. OR operation support (previously impossible with sequential filters)
2. Performance optimization (reduced data reorganization, optimized stats computation)
3. Explicit logical relationships (self-documenting configuration)
4. Complementary to FusedFilter (automatic optimization + explicit control)
---
 data_juicer/ops/filter/__init__.py            |   2 +
 .../ops/filter/combined_logical_filter.py     | 234 ++++++++++++
 data_juicer/ops/op_fusion.py                  | 135 ++++---
 docs/Operators.md                             |   3 +-
 .../filter/combined_logical_filter.md         | 348 ++++++++++++++++++
 .../filter/test_combined_logical_filter.py    | 177 +++++++++
 6 files changed, 857 insertions(+), 42 deletions(-)
 create mode 100644 data_juicer/ops/filter/combined_logical_filter.py
 create mode 100644 docs/operators/filter/combined_logical_filter.md
 create mode 100644 tests/ops/filter/test_combined_logical_filter.py

diff --git a/data_juicer/ops/filter/__init__.py b/data_juicer/ops/filter/__init__.py
index 2825ed1c01a..1265d1707c6 100644
--- a/data_juicer/ops/filter/__init__.py
+++ b/data_juicer/ops/filter/__init__.py
@@ -4,6 +4,7 @@
 from .audio_size_filter import AudioSizeFilter
 from .average_line_length_filter import AverageLineLengthFilter
 from .character_repetition_filter import CharacterRepetitionFilter
+from .combined_logical_filter import CombinedLogicalFilter
 from .flagged_words_filter import FlaggedWordFilter
 from .general_field_filter import GeneralFieldFilter
 from .image_aesthetics_filter import ImageAestheticsFilter
@@ -78,6 +79,7 @@
     "ImageTextSimilarityFilter",
     "ImageWatermarkFilter",
     "LanguageIDScoreFilter",
+    "CombinedLogicalFilter",
     "InContextInfluenceFilter",
     "InstructionFollowingDifficultyFilter",
     "LLMAnalysisFilter",
diff --git a/data_juicer/ops/filter/combined_logical_filter.py b/data_juicer/ops/filter/combined_logical_filter.py
new file mode 100644
index 00000000000..9aa8edd158d
--- /dev/null
+++ b/data_juicer/ops/filter/combined_logical_filter.py
@@ -0,0 +1,234 @@
+"""
+Combined Logical Filter Operator.
+
+A composition operator that combines multiple filter operators with
+logical operations (AND/OR). This operator is more explicit and
+self-documenting than using separate filters in sequence.
+"""
+
+from typing import Dict, List
+
+import numpy as np
+
+from ...utils.constant import Fields
+from ..base_op import OPERATORS, Filter
+from ..load import load_ops
+
+OP_NAME = "combined_logical_filter"
+
+
+@OPERATORS.register_module(OP_NAME)
+class CombinedLogicalFilter(Filter):
+    """A combined filter operator that applies multiple filter operators
+    with logical operations (AND/OR).
+
+    This is a composition operator that combines multiple filter operators
+    and applies a logical operation (AND or OR) to their results. It's
+    more explicit and self-documenting than using separate filters in
+    sequence.
+
+    Features:
+    - Supports AND/OR logical operations
+    - Handles both batched and single-sample processing
+    - Compatible with Ray executor
+    - Automatically handles context variables for intermediate operations
+    - Supports CUDA-accelerated filters
+
+    组合型过滤算子，将多个过滤算子组合并应用逻辑运算（AND/OR）。
+
+    这是一个组合算子，将多个过滤算子组合并对其结果应用逻辑运算（AND 或 OR）。
+    比在序列中使用单独的过滤器更明确和自文档化。
+
+    特性：
+    - 支持 AND/OR 逻辑运算
+    - 处理批处理和单样本处理
+    - 兼容 Ray 执行器
+    - 自动处理中间操作的上下文变量
+    - 支持 CUDA 加速的过滤器
+    """
+
+    _batched_op = True
+
+    def __init__(
+        self,
+        filter_ops: List[Dict[str, dict]],
+        logical_op: str = "and",
+        *args,
+        **kwargs,
+    ):
+        """
+        Initialization method.
+
+        :param filter_ops: A list of filter operator configurations.
+            Each item should be a dict with operator name as key and its
+            parameters as value. Example: [{"text_length_filter":
+            {"min_len": 10, "max_len": 100}}, {"language_id_score_filter":
+            {"lang": "zh", "min_score": 0.8}}]
+        :param logical_op: The logical operator to combine filter results.
+            Can be "and" or "or". Default is "and". When "and" is used,
+            a sample is kept only if all filters return True. When "or"
+            is used, a sample is kept if any filter returns True.
+        :param args: extra args
+        :param kwargs: extra args
+        """
+        super().__init__(*args, **kwargs)
+        if not filter_ops:
+            raise ValueError("filter_ops cannot be empty. " "At least one filter operator is required.")
+        if logical_op.lower() not in ["and", "or"]:
+            raise ValueError(f"logical_op must be 'and' or 'or', got '{logical_op}'")
+
+        self.logical_op = logical_op.lower()
+        # Load filter operators from configuration
+        self.filter_ops = load_ops(filter_ops)
+
+        # Verify all loaded operators are Filter instances
+        for i, op in enumerate(self.filter_ops):
+            if not isinstance(op, Filter):
+                raise ValueError(
+                    f"All operators in filter_ops must be Filter instances, " f"got {type(op).__name__} at index {i}"
+                )
+
+        # Set accelerator to 'cuda' if any of the filters use it
+        accelerator_methods = set([op.accelerator for op in self.filter_ops])
+        if "cuda" in accelerator_methods:
+            self.accelerator = "cuda"
+
+        # Set num_proc to the minimum of all filters
+        # This ensures compatibility when filters have different
+        # num_proc requirements
+        self.num_proc = min([op.runtime_np() for op in self.filter_ops])
+
+    def compute_stats_batched(self, samples, rank=None, context=False):
+        """Compute stats for all filter operators in batch mode.
+
+        This method applies all filter operators sequentially to compute
+        statistics. Context variables are automatically handled for
+        operators that need them.
+
+        :param samples: Batch of samples in dict-of-lists format
+        :param rank: Rank for CUDA operations (used when any filter
+            uses CUDA)
+        :param context: Whether to use context for intermediate
+            variables
+        :return: Samples with computed statistics
+        """
+        # Context for intermediate variables
+        num_samples = len(samples[Fields.stats])
+        if Fields.context not in samples:
+            samples[Fields.context] = [{} for _ in range(num_samples)]
+
+        # Apply all filter operators to compute stats
+        for op in self.filter_ops:
+            process_args = {}
+            if op.accelerator == "cuda":
+                process_args["rank"] = rank
+            needs_context = context or (hasattr(op, "compute_stats_batched") and op.accelerator == "cuda")
+            if needs_context:
+                # Check if the operator needs context
+                from data_juicer.utils.common_utils import check_op_method_param
+
+                if check_op_method_param(op.compute_stats, "context"):
+                    process_args["context"] = True
+
+            if hasattr(op, "compute_stats_batched"):
+                samples = op.compute_stats_batched(samples, **process_args)
+            else:
+                # Fallback to single sample processing for non-batched ops
+                keys = samples.keys()
+                for i in range(num_samples):
+                    this_sample = {key: samples[key][i] for key in keys}
+                    context_flag = process_args.get("context", False)
+                    res_sample = op.compute_stats_single(this_sample, context=context_flag)
+                    samples[Fields.stats][i] = res_sample[Fields.stats]
+                    # Preserve context if it was modified
+                    if Fields.context in res_sample:
+                        samples[Fields.context][i].update(res_sample[Fields.context])
+
+        return samples
+
+    def process_batched(self, samples):
+        """Process samples by combining results from all filter operators
+        in batch mode.
+
+        This method applies all filters and combines their results using
+        the specified logical operation (AND or OR).
+
+        :param samples: Batch of samples in dict-of-lists format
+        :return: List of boolean values indicating which samples to keep
+        """
+        # Get results from all filter operators
+        all_results = []
+        for op in self.filter_ops:
+            if hasattr(op, "process_batched"):
+                results = list(op.process_batched(samples))
+            else:
+                # Fallback to single sample processing for non-batched ops
+                results = [op.process_single({Fields.stats: stat}) for stat in samples[Fields.stats]]
+            all_results.append(np.array(results, dtype=bool))
+
+        # Combine results based on logical operator
+        if len(all_results) == 0:
+            # Edge case: no filters (should not happen due to validation,
+            # but handle gracefully)
+            return [True] * len(samples[Fields.stats])
+
+        combined_result = all_results[0]
+        for result in all_results[1:]:
+            if self.logical_op == "and":
+                combined_result = np.logical_and(combined_result, result)
+            else:  # or
+                combined_result = np.logical_or(combined_result, result)
+
+        return combined_result.tolist()
+
+    def compute_stats_single(self, sample, context=False):
+        """Compute stats for a single sample using all filter operators.
+
+        :param sample: Single sample in dict format
+        :param context: Whether to use context for intermediate variables
+        :return: Sample with computed statistics
+        """
+        # Apply all filter operators to compute stats
+        for op in self.filter_ops:
+            if hasattr(op, "compute_stats_single"):
+                sample = op.compute_stats_single(sample, context=context)
+            else:
+                # For batched-only operators, we cannot compute stats for
+                # a single sample. This is a limitation - batched-only
+                # operators should implement compute_stats_single or we need
+                # to create a minimal batch. For now, we skip stats
+                # computation for batched-only operators in single mode.
+                # This is acceptable because process_single will handle the
+                # fallback.
+                pass
+        return sample
+
+    def process_single(self, sample: Dict) -> bool:
+        """Process a single sample by combining results from all filter
+        operators.
+
+        :param sample: Single sample in dict format
+        :return: Boolean indicating whether to keep the sample
+        """
+        # Get results from all filter operators
+        results = []
+        for op in self.filter_ops:
+            if hasattr(op, "process_single"):
+                result = op.process_single(sample)
+            else:
+                # For batched-only operators, create a minimal batch
+                stat = sample.get(Fields.stats, {})
+                batch_samples = {Fields.stats: [stat]}
+                # Also preserve other fields if they exist
+                for key in sample:
+                    if key != Fields.stats:
+                        batch_samples[key] = [sample[key]]
+                batch_result = list(op.process_batched(batch_samples))
+                result = batch_result[0] if batch_result else True
+            results.append(result)
+
+        # Combine results based on logical operator
+        if self.logical_op == "and":
+            return all(results)
+        else:  # or
+            return any(results)
diff --git a/data_juicer/ops/op_fusion.py b/data_juicer/ops/op_fusion.py
index a4a8967f3f9..0c0a033de90 100644
--- a/data_juicer/ops/op_fusion.py
+++ b/data_juicer/ops/op_fusion.py
@@ -25,7 +25,14 @@
 INTER_SAMPLED_FRAMES = Registry(InterVars.sampled_frames)
 
 # all
-ALL_INTER_VARS = [INTER_LINES, INTER_WORDS, LOADED_AUDIOS, LOADED_IMAGES, LOADED_VIDEOS, INTER_SAMPLED_FRAMES]
+ALL_INTER_VARS = [
+    INTER_LINES,
+    INTER_WORDS,
+    LOADED_AUDIOS,
+    LOADED_IMAGES,
+    LOADED_VIDEOS,
+    INTER_SAMPLED_FRAMES,
+]
 
 # supported fusion strategies
 FUSION_STRATEGIES = {"greedy", "probe"}
@@ -80,10 +87,11 @@ def fuse_filter_group(original_filter_group):
                 all_fused_filters[inter_vars].append((op, probe_res))
                 break
         else:
-            # first apply other filters to decrease the number of samples, so
-            # we add them into the fused_group list directly
+            # first apply other filters to decrease the number of samples,
+            # so we add them into the fused_group list directly
             fused_group.append(op)
-            group_speed.append(probe_res["speed"] if probe_res else 0)
+            speed = probe_res["speed"] if probe_res else 0
+            group_speed.append(speed)
 
     # try to fuse ops for each type of intermediate vars
     for inter_vars in all_intermediate_vars:
@@ -92,18 +100,18 @@ def fuse_filter_group(original_filter_group):
             # no ops include this type of intermediate var
             pass
         elif len(inter_vars_filter) > 1:
-            # more than 1 ops share the same intermediate var, try to fuse them
+            # more than 1 ops share the same intermediate var, try to fuse
             ops, probe_res_list = zip(*inter_vars_filter)
             # new definition: new name and a definition list of fused op list
-            fused_filter_name = "OpFusion:(%s)" % ",".join([op._name for op in ops])
+            op_names = [op._name for op in ops]
+            fused_filter_name = "OpFusion:(%s)" % ",".join(op_names)
             logger.info(f"Ops are fused into one op " f"{fused_filter_name}.")
             # use these ops to create a FusedFilter object, and add the fused
             # definition and op into the fused group
             fused_filter = FusedFilter(fused_filter_name, ops)
             fused_filter._op_cfg = {fused_filter_name: [op._op_cfg for op in ops]}
-            fused_filter_speed = sum([1.0 / probe_res["speed"] for probe_res in probe_res_list if probe_res])
-            if fused_filter_speed > 0:
-                fused_filter_speed = 1.0 / fused_filter_speed
+            speed_sum = sum([1.0 / probe_res["speed"] for probe_res in probe_res_list if probe_res])
+            fused_filter_speed = 1.0 / speed_sum if speed_sum > 0 else 0
             fused_group.append(fused_filter)
             group_speed.append(fused_filter_speed)
         else:
@@ -114,37 +122,55 @@ def fuse_filter_group(original_filter_group):
             group_speed.append(probe_res["speed"] if probe_res else 0)
 
     # reorder according to the probed speed results in group_speed
-    # 'greedy': all speed data in group_speed will be 0, which will keep the
-    #   current order of fused group
-    # 'probe': OPs in fused group will be reordered according to the speed data
-    #   in group_speed in descending order
-    fused_group = [op for op, _ in sorted(zip(fused_group, group_speed), key=lambda it: it[1], reverse=True)]
+    # 'greedy': all speed data in group_speed will be 0, which will keep
+    #   the current order of fused group
+    # 'probe': OPs in fused group will be reordered according to the speed
+    #   data in group_speed in descending order
+    sorted_pairs = sorted(zip(fused_group, group_speed), key=lambda it: it[1], reverse=True)
+    fused_group = [op for op, _ in sorted_pairs]
 
     return fused_group
 
 
 class FusedFilter(Filter):
-    """A fused operator for filters."""
+    """A fused operator for filters.
+
+    This is automatically created by the framework when op_fusion is enabled.
+    It fuses multiple filters that share the same intermediate variables for
+    performance optimization. By default, it uses AND logic (all filters must
+    pass). For OR logic or explicit control, use combined_logical_filter
+    instead.
+    """
 
     _batched_op = True
 
-    def __init__(self, name: str, fused_filters: List):
+    def __init__(self, name: str, fused_filters: List, logical_op: str = "and"):
         """
         Initialization method.
 
+        :param name: Name of the fused filter.
         :param fused_filters: a list of filters to be fused.
+        :param logical_op: The logical operator to combine filter results.
+            Can be "and" or "or". Default is "and" for automatic fusion.
+            Note: Automatic fusion always uses AND. OR is only available
+            when explicitly creating FusedFilter instances.
         """
         self._name = name
         super().__init__()
         self.fused_filters = fused_filters
-        # set accelerator to 'cuda' if there exists any ops whose accelerator
-        # is 'cuda'
+        if logical_op.lower() not in ["and", "or"]:
+            raise ValueError(f"logical_op must be 'and' or 'or', " f"got '{logical_op}'")
+        self.logical_op = logical_op.lower()
+
+        # set accelerator to 'cuda' if there exists any ops whose
+        # accelerator is 'cuda'
         accelerator_methods = set([op.accelerator for op in self.fused_filters])
         if "cuda" in accelerator_methods:
             self.accelerator = "cuda"
 
         # update num_proc with the min num_proc of all fusible filters
-        self.num_proc = min([op.runtime_np() for op in self.fused_filters])
+        runtime_nps = [op.runtime_np() for op in self.fused_filters]
+        self.num_proc = min(runtime_nps)
 
     def compute_stats_batched(self, samples, rank=None):
         import av
@@ -169,12 +195,19 @@ def compute_stats_batched(self, samples, rank=None):
         return samples
 
     def process_batched(self, samples):
-        # Only return True when all filters return True
+        """Process samples by combining results from all fused filters.
+
+        :param samples: Batch of samples in dict-of-lists format
+        :return: Boolean array indicating which samples to keep
+        """
         res = None
         for op in self.fused_filters:
             this_res = np.array(list(op.process_batched(samples)))
             if res is not None:
-                res = np.logical_and(res, this_res)
+                if self.logical_op == "and":
+                    res = np.logical_and(res, this_res)
+                else:  # or
+                    res = np.logical_or(res, this_res)
             else:
                 res = this_res
         return res
@@ -182,22 +215,31 @@ def process_batched(self, samples):
 
 @OPERATORS.register_module("general_fused_op")
 class GeneralFusedOP(Mapper):
-    """An explicitly fused operator designed to execute multiple sequential operations (OPs) on
-    the same batch, enabling fine-grained control over data processing.
-
-    This operator allows for the chaining of multiple data processing steps, such as mappers
-    and filters, into a single pass. It processes each batch of samples sequentially through
-    the defined operations, ensuring that all specified transformations are applied in
-    order. The operator supports both mappers, which transform data, and filters, which
-    remove or keep samples based on computed statistics. Context variables can be passed
-    between operations if needed. The accelerator is set to 'cuda' if any of the fused
-    operations use it. The number of processes is determined by the minimum value among all
-    fused operations. After processing, any temporary context variables, such as those used
-    for video containers, are cleaned up."""
+    """An explicitly fused operator designed to execute multiple sequential
+    operations (OPs) on the same batch, enabling fine-grained control over
+    data processing.
+
+    This operator allows for the chaining of multiple data processing steps,
+    such as mappers and filters, into a single pass. It processes each batch
+    of samples sequentially through the defined operations, ensuring that all
+    specified transformations are applied in order. The operator supports
+    both mappers, which transform data, and filters, which remove or keep
+    samples based on computed statistics. Context variables can be passed
+    between operations if needed. The accelerator is set to 'cuda' if any
+    of the fused operations use it. The number of processes is determined
+    by the minimum value among all fused operations. After processing, any
+    temporary context variables, such as those used for video containers,
+    are cleaned up."""
 
     _batched_op = True
 
-    def __init__(self, batch_size: int = 1, fused_op_list: Optional[List] = None, *args, **kwargs):
+    def __init__(
+        self,
+        batch_size: int = 1,
+        fused_op_list: Optional[List] = None,
+        *args,
+        **kwargs,
+    ):
         """
         Initialization.
 
@@ -209,15 +251,21 @@ def __init__(self, batch_size: int = 1, fused_op_list: Optional[List] = None, *a
         if fused_op_list is None:
             fused_op_list = []
         self.fused_ops = load_ops(fused_op_list)
-        self._name = "GeneralFusedOP:(%s)" % ",".join([op._name for op in self.fused_ops])
-        # set accelerator to 'cuda' if there exists any ops whose accelerator
-        # is 'cuda'
+        op_names = [op._name for op in self.fused_ops]
+        name_str = ",".join(op_names)
+        self._name = f"GeneralFusedOP:({name_str})"
+        # set accelerator to 'cuda' if there exists any ops whose
+        # accelerator is 'cuda'
         accelerator_methods = set([op.accelerator for op in self.fused_ops])
         if "cuda" in accelerator_methods:
             self.accelerator = "cuda"
 
         # update num_proc with the min num_proc of all fusible filters
-        self.num_proc = min([op.runtime_np() for op in self.fused_ops]) if self.fused_ops else 1
+        if self.fused_ops:
+            runtime_nps = [op.runtime_np() for op in self.fused_ops]
+            self.num_proc = min(runtime_nps)
+        else:
+            self.num_proc = 1
 
     def process_batched(self, samples, rank=None):
         from copy import deepcopy
@@ -235,12 +283,14 @@ def process_batched(self, samples, rank=None):
             process_args = {"rank": rank} if op.accelerator == "cuda" else {}
             if isinstance(op, Mapper):
                 if check_op_method_param(op.process, "context"):
-                    # add context param only when the core process method of this OP contains this param
+                    # add context param only when the core process method
+                    # of this OP contains this param
                     process_args["context"] = True
                 samples = op.process_batched(tmp_samples, **process_args)
             elif isinstance(op, Filter):
                 if check_op_method_param(op.compute_stats, "context"):
-                    # add context param only when the core process method of this OP contains this param
+                    # add context param only when the core process method
+                    # of this OP contains this param
                     process_args["context"] = True
                 tmp_samples = op.compute_stats_batched(tmp_samples, **process_args)
                 indicators = list(op.process_batched(tmp_samples))
@@ -257,7 +307,10 @@ def process_batched(self, samples, rank=None):
         # check if there are containers that need to be closed
         for ctx in tmp_samples[Fields.context]:
             for context_key in ctx:
-                if isinstance(ctx[context_key], av.container.InputContainer):
+                if isinstance(
+                    ctx[context_key],
+                    av.container.InputContainer,
+                ):
                     ctx[context_key].streams.video[0].close()
                     ctx[context_key].close()
         _ = tmp_samples.pop(Fields.context)
diff --git a/docs/Operators.md b/docs/Operators.md
index 7a7918ec3b5..58854707a1b 100644
--- a/docs/Operators.md
+++ b/docs/Operators.md
@@ -43,7 +43,7 @@ Data-Juicer 中的算子分为以下 8 种类型。
 |------|:------:|-------------|
 | [aggregator](#aggregator) | 4 | Aggregate for batched samples, such as summary or conclusion. 对批量样本进行汇总，如得出总结或结论。 |
 | [deduplicator](#deduplicator) | 10 | Detects and removes duplicate samples. 识别、删除重复样本。 |
-| [filter](#filter) | 56 | Filters out low-quality samples. 过滤低质量样本。 |
+| [filter](#filter) | 57 | Filters out low-quality samples. 过滤低质量样本。 |
 | [formatter](#formatter) | 8 | Discovers, loads, and canonicalizes source data. 发现、加载、规范化原始数据。 |
 | [grouper](#grouper) | 3 | Group samples to batched samples. 将样本分组，每一组组成一个批量样本。 |
 | [mapper](#mapper) | 103 | Edits and transforms samples. 对数据样本进行编辑和转换。 |
@@ -104,6 +104,7 @@ All the specific operators are listed below, each featured with several capabili
 | audio_size_filter | 📣Audio 💻CPU 🟢Stable | Keep data samples based on the size of their audio files. 根据音频文件的大小保留数据样本。 | [info](operators/filter/audio_size_filter.md) | - |
 | average_line_length_filter | 🔤Text 💻CPU 🟢Stable | Filter to keep samples with average line length within a specific range. 过滤器，以保持平均线长度在特定范围内的样本。 | [info](operators/filter/average_line_length_filter.md) | - |
 | character_repetition_filter | 🔤Text 💻CPU 🟢Stable | Filter to keep samples with character-level n-gram repetition ratio within a specific range. 过滤器将具有字符级n-gram重复比的样本保持在特定范围内。 | [info](operators/filter/character_repetition_filter.md) | - |
+| combined_logical_filter | 💻CPU 🟡Beta | A combined filter operator that applies multiple filter operators with logical operations (AND/OR). 将多个筛选运算符应用于逻辑运算 (AND/OR) 的组合筛选运算符。 | [info](operators/filter/combined_logical_filter.md) | - |
 | flagged_words_filter | 🔤Text 💻CPU 🟢Stable | Filter to keep samples with flagged-word ratio in a specified range. 过滤器将标记词比率的样本保留在指定范围内。 | [info](operators/filter/flagged_words_filter.md) | - |
 | general_field_filter | 💻CPU 🟡Beta | Filter to keep samples based on a general field filter condition. 根据常规字段筛选条件保留样本。 | [info](operators/filter/general_field_filter.md) | - |
 | image_aesthetics_filter | 🏞Image 🚀GPU 🧩HF 🟢Stable | Filter to keep samples with aesthetics scores within a specific range. 过滤以保持美学分数在特定范围内的样品。 | [info](operators/filter/image_aesthetics_filter.md) | - |
diff --git a/docs/operators/filter/combined_logical_filter.md b/docs/operators/filter/combined_logical_filter.md
new file mode 100644
index 00000000000..6c9e99be241
--- /dev/null
+++ b/docs/operators/filter/combined_logical_filter.md
@@ -0,0 +1,348 @@
+# combined_logical_filter
+
+A combined filter operator that applies multiple filter operators with logical operations (AND/OR).
+
+This is a composition operator that combines multiple filter operators and applies a logical operation (AND or OR) to their results. It's more explicit and self-documenting than using separate filters in sequence.
+
+组合型过滤算子，将多个过滤算子组合并应用逻辑运算（AND/OR）。
+
+这是一个组合算子，将多个过滤算子组合并对其结果应用逻辑运算（AND 或 OR）。比在序列中使用单独的过滤器更明确和自文档化。
+
+Type 算子类型: **filter**
+
+Tags 标签: cpu, gpu
+
+## 🔧 Parameter Configuration 参数配置
+| name 参数名 | type 类型 | default 默认值 | desc 说明 |
+|--------|------|--------|------|
+| `filter_ops` | <class 'list'> | `[]` | A list of filter operator configurations. Each item should be a dict with operator name as key and its parameters as value. Example: [{"text_length_filter": {"min_len": 10, "max_len": 100}}, {"language_id_score_filter": {"lang": "zh", "min_score": 0.8}}] |
+| `logical_op` | <class 'str'> | `'and'` | The logical operator to combine filter results. Can be "and" or "or" (case-insensitive). When "and" is used, a sample is kept only if all filters return True. When "or" is used, a sample is kept if any filter returns True. |
+| `args` |  | `''` | extra args |
+| `kwargs` |  | `''` | extra args |
+
+## 📊 Effect demonstration 效果演示
+
+### Example 1: AND operation (默认)
+```python
+CombinedLogicalFilter(
+    filter_ops=[
+        {"text_length_filter": {"min_len": 10, "max_len": 100}},
+        {"language_id_score_filter": {"lang": "zh", "min_score": 0.8}}
+    ],
+    logical_op="and"
+)
+```
+
+This configuration will keep only samples that:
+- Have text length between 10 and 100 characters **AND**
+- Have Chinese language score >= 0.8
+
+此配置将只保留同时满足以下条件的样本：
+- 文本长度在10到100个字符之间 **且**
+- 中文语言得分 >= 0.8
+
+### Example 2: OR operation
+```python
+CombinedLogicalFilter(
+    filter_ops=[
+        {"text_length_filter": {"min_len": 10, "max_len": 50}},
+        {"text_length_filter": {"min_len": 100, "max_len": 200}}
+    ],
+    logical_op="or"
+)
+```
+
+This configuration will keep samples that:
+- Have text length between 10 and 50 characters **OR**
+- Have text length between 100 and 200 characters
+
+此配置将保留满足以下任一条件的样本：
+- 文本长度在10到50个字符之间 **或**
+- 文本长度在100到200个字符之间
+
+### Example 3: Configuration file usage (配置文件使用示例)
+
+In a YAML configuration file:
+
+```yaml
+process:
+  - combined_logical_filter:
+      filter_ops:
+        - text_length_filter:
+            min_len: 10
+            max_len: 1000
+        - language_id_score_filter:
+            lang: 'zh'
+            min_score: 0.8
+      logical_op: 'and'
+```
+
+## 💡 Design Rationale 设计理由
+
+This operator is designed as a **composition operator** rather than a simple filter. It explicitly combines multiple filters with a logical operation, making the intent clear and self-documenting.
+
+这个算子被设计为一个**组合算子**而不是简单的过滤器。它明确地将多个过滤器与逻辑运算组合，使意图清晰且自文档化。
+
+### Comparison with Sequential Filters 与串联过滤器的对比
+
+**Sequential Filters 串联过滤器**:
+```yaml
+process:
+  - text_length_filter: {min_len: 10, max_len: 50}
+  - text_length_filter: {min_len: 100, max_len: 200}
+```
+- ❌ **Implicit AND only**: Filters are applied sequentially, each filtering the result of the previous one. This is equivalent to AND logic.
+- ❌ **Cannot implement OR**: There's no way to keep samples that satisfy **either** condition.
+- ❌ **Unclear intent**: The logical relationship is implicit and not self-documenting.
+- ⚠️ **Performance**: Each filter processes data independently, with intermediate data reduction and reorganization.
+
+- ❌ **仅隐式 AND**：过滤器按顺序应用，每个过滤器过滤前一个的结果。这相当于 AND 逻辑。
+- ❌ **无法实现 OR**：无法保留满足**任一**条件的样本。
+- ❌ **意图不明确**：逻辑关系是隐式的，不自文档化。
+- ⚠️ **性能**：每个过滤器独立处理数据，需要中间数据缩减和重组。
+
+**Combined Logical Filter 组合逻辑过滤器**:
+```yaml
+process:
+  - combined_logical_filter:
+      filter_ops:
+        - text_length_filter: {min_len: 10, max_len: 50}
+        - text_length_filter: {min_len: 100, max_len: 200}
+      logical_op: 'or'  # ✅ Now possible!
+```
+- ✅ **Explicit logic**: AND/OR relationship is clearly specified.
+- ✅ **OR support**: Can now implement OR operations that sequential version impossible.
+- ✅ **Self-documenting**: The intent is clear from the configuration.
+- ✅ **Performance optimized**: All filters compute stats in one batch, avoiding intermediate data reorganization.
+
+- ✅ **显式逻辑**：AND/OR 关系明确指定。
+- ✅ **OR 支持**：现在可以实现隐式串联版本无法实现的 OR 操作。
+- ✅ **自文档化**：意图从配置中清晰可见。
+- ✅ **性能优化**：所有过滤器在一次批处理中计算统计信息，避免中间数据重组。
+
+### Comparison with FusedFilter 与 FusedFilter 的对比
+
+**FusedFilter (Automatic Fusion) 自动融合**:
+- Framework automatically fuses consecutive filters that share intermediate variables
+- Optimized for performance (reduces memory and speeds up processing)
+- Default logical operation is AND
+- User cannot explicitly control which filters are fused or the logical operation
+- Best for: Performance optimization when filters share intermediate variables
+
+- 框架自动融合共享中间变量的连续过滤器
+- 针对性能优化（减少内存并加速处理）
+- 默认逻辑运算是 AND
+- 用户无法显式控制哪些过滤器被融合或逻辑运算
+- 适用于：共享中间变量的过滤器性能优化
+
+**Combined Logical Filter (Explicit Control) 显式控制**:
+- User explicitly specifies which filters to combine
+- Supports both AND and OR operations
+- User has full control over the logical relationship
+- Works with any filters, regardless of intermediate variables
+- Best for: When you need OR operations or explicit control over filter combinations
+
+- 用户显式指定要组合的过滤器
+- 支持 AND 和 OR 操作
+- 用户完全控制逻辑关系
+- 适用于任何过滤器，不依赖中间变量
+- 适用于：需要 OR 操作或对过滤器组合进行显式控制的场景
+
+### When to Use Each 何时使用哪个
+
+| Scenario 场景 | Recommended Approach 推荐方式 | Reason 原因 |
+|--------------|---------------------------|-----------|
+| Multiple filters with AND logic, sharing intermediate vars | FusedFilter (automatic) | Performance optimization |
+| 多个过滤器使用 AND 逻辑，共享中间变量 | FusedFilter（自动） | 性能优化 |
+| Need OR operation | combined_logical_filter | Only way to implement OR |
+| 需要 OR 操作 | combined_logical_filter | 实现 OR 的唯一方式 |
+| Explicit control over filter combination | combined_logical_filter | Full user control |
+| 对过滤器组合的显式控制 | combined_logical_filter | 完全的用户控制 |
+| Complex filter logic | combined_logical_filter | Clear and self-documenting |
+| 复杂的过滤器逻辑 | combined_logical_filter | 清晰且自文档化 |
+
+## 🚀 Performance Advantages 性能优势
+
+Compared to sequential filters, `combined_logical_filter` provides several performance benefits:
+
+与串联过滤器相比，`combined_logical_filter` 提供以下性能优势：
+
+### 1. Optimized Statistics Computation 优化的统计信息计算
+
+**Sequential Filters 串联过滤器**:
+```
+Dataset → Filter1.compute_stats → Filter1.process → Reduced Dataset1
+       → Filter2.compute_stats → Filter2.process → Reduced Dataset2
+```
+- Each filter computes stats independently
+- Dataset is reduced after each filter
+- Multiple data reorganizations required
+- May duplicate some statistics computation
+
+- 每个过滤器独立计算统计信息
+- 每个过滤器后数据集被缩减
+- 需要多次数据重组
+- 可能重复某些统计信息计算
+
+**Combined Logical Filter 组合逻辑过滤器**:
+```
+Dataset → All Filters.compute_stats (single batch)
+       → All Filters.process (parallel evaluation)
+       → Logical combination → Final Dataset
+```
+- All statistics computed in one batch pass
+- No intermediate dataset reduction
+- Single data reorganization
+- Shared statistics computation
+
+- 所有统计信息在一次批处理中计算完成
+- 无中间数据集缩减
+- 单次数据重组
+- 共享统计信息计算
+
+### 2. Batch Processing Optimization 批处理优化
+
+- All filters' `process_batched` methods execute on the same batch
+- Uses numpy's vectorized `logical_and`/`logical_or` operations
+- Avoids multiple data passes and reorganizations
+- Better memory locality and cache efficiency
+
+- 所有过滤器的 `process_batched` 方法在同一批数据上执行
+- 使用 numpy 的向量化 `logical_and`/`logical_or` 操作
+- 避免多次数据传递和重组
+- 更好的内存局部性和缓存效率
+
+### 3. OR Operation  OR 操作 
+
+**This is the most important advantage**: OR operations were completely impossible with sequential filters. Now you can:
+
+**这是最重要的优势**：OR 操作在串联过滤器中完全无法实现。现在您可以：
+
+- Keep samples that satisfy **any** of multiple conditions
+- Implement complex filtering logic (e.g., "short text OR long text, but not medium")
+- Combine different filter types with OR logic
+
+- 保留满足**任一**多个条件的样本
+- 实现复杂的过滤逻辑（例如，"短文本或长文本，但不是中等长度"）
+- 将不同类型的过滤器与 OR 逻辑组合
+
+## ⚠️ Notes 注意事项
+
+1. **Filter Type Requirement**: All operators in `filter_ops` must be Filter instances. Other operator types (Mapper, etc.) are not supported.
+
+   **过滤器类型要求**：`filter_ops` 中的所有算子必须是 Filter 实例。不支持其他算子类型（Mapper 等）。
+
+2. **Processing Modes**: The operator automatically handles both batched and single-sample processing modes. It will fall back to single-sample processing for non-batched operators.
+
+   **处理模式**：该算子自动处理批处理和单样本处理模式。对于非批处理算子，它将回退到单样本处理。
+
+3. **Stats Computation**: All filters compute their stats before processing. The operator ensures this by calling `compute_stats_batched` for all filters first, which is more efficient than sequential computation.
+
+   **统计信息计算**：所有过滤器在处理前计算其统计信息。该算子通过首先为所有过滤器调用 `compute_stats_batched` 来确保这一点，这比顺序计算更高效。
+
+4. **Ray Compatibility**: This operator is compatible with Ray executor. It properly handles CUDA-accelerated filters and context variables.
+
+   **Ray 兼容性**：该算子兼容 Ray 执行器。它正确处理 CUDA 加速的过滤器和上下文变量。
+
+5. **Performance**: For best performance, use batched filter operators. The operator will automatically optimize batch processing.
+
+   **性能**：为了获得最佳性能，请使用批处理过滤器算子。该算子将自动优化批处理。
+
+## 🔄 Ray Mode Compatibility Ray 模式兼容性
+
+This operator is compatible with Ray executor:
+
+该算子兼容 Ray 执行器：
+
+- ✅ Supports batched processing in Ray mode
+- ✅ Handles CUDA-accelerated filters correctly
+- ✅ Properly manages context variables for intermediate operations
+- ✅ Works with Ray's distributed processing
+
+- ✅ 在 Ray 模式下支持批处理
+- ✅ 正确处理 CUDA 加速的过滤器
+- ✅ 正确管理中间操作的上下文变量
+- ✅ 与 Ray 的分布式处理兼容
+
+## 📝 Real-World Use Cases 实际使用场景
+
+### Use Case 1: OR Operation 场景1：OR 操作
+
+**Scenario**: Keep samples that are either short (10-50 chars) or long (100-200 chars), but exclude medium-length text.
+
+**场景**：保留短文本（10-50 字符）或长文本（100-200 字符），但排除中等长度文本。
+
+```yaml
+process:
+  - combined_logical_filter:
+      filter_ops:
+        - text_length_filter: {min_len: 10, max_len: 50}
+        - text_length_filter: {min_len: 100, max_len: 200}
+      logical_op: 'or'
+```
+
+**Why this was impossible before**: Sequential filters can only implement AND logic. The first filter would remove all samples outside its range, leaving nothing for the second filter to process.
+
+**为什么Default Sequential Version 无法实现**：串联过滤器只能实现 AND 逻辑。第一个过滤器会移除其范围外的所有样本，第二个过滤器无法处理。
+
+### Use Case 2: Multi-Criteria Filtering 场景2：多条件过滤
+
+**Scenario**: Keep samples that are in Chinese OR have high quality scores.
+
+**场景**：保留中文样本或高质量得分样本。
+
+```yaml
+process:
+  - combined_logical_filter:
+      filter_ops:
+        - language_id_score_filter: {lang: 'zh', min_score: 0.8}
+        - llm_quality_score_filter: {min_score: 0.9}
+      logical_op: 'or'
+```
+
+### Use Case 3: Performance Optimization 场景3：性能优化
+
+**Scenario**: Multiple filters with AND logic, but you want explicit control and better performance than sequential filters.
+
+**场景**：多个过滤器使用 AND 逻辑，但您想要显式控制和比串联过滤器更好的性能。
+
+```yaml
+process:
+  - combined_logical_filter:
+      filter_ops:
+        - text_length_filter: {min_len: 10, max_len: 1000}
+        - language_id_score_filter: {lang: 'zh', min_score: 0.8}
+        - alphanumeric_filter: {min_ratio: 0.5}
+      logical_op: 'and'
+```
+
+**Benefits**: All statistics computed in one pass, no intermediate data reduction, better performance.
+
+**优势**：所有统计信息在一次遍历中计算，无中间数据缩减，性能更好。
+
+## 🚀 Future Enhancements 未来增强
+
+Potential improvements for future versions:
+
+未来版本的潜在改进：
+
+1. **Nested Logical Operations**: Support for complex nested logical expressions (e.g., (A AND B) OR (C AND D))
+
+   **嵌套逻辑运算**：支持复杂的嵌套逻辑表达式（例如，(A AND B) OR (C AND D)）
+
+2. **Performance Optimization**: Further optimization for large-scale batch processing
+
+   **性能优化**：进一步优化大规模批处理
+
+3. **Integration with FusedFilter**: Automatic detection and optimization when combined with framework's automatic fusion
+
+   **与 FusedFilter 集成**：与框架的自动融合结合时自动检测和优化
+
+4. **Validation**: Enhanced validation and error messages for invalid configurations
+
+   **验证**：增强的验证和错误消息，用于无效配置
+
+## 🔗 related links 相关链接
+- [source code 源代码](../../../data_juicer/ops/filter/combined_logical_filter.py)
+- [unit test 单元测试](../../../tests/ops/filter/test_combined_logical_filter.py)
+- [Return operator list 返回算子列表](../../Operators.md)
diff --git a/tests/ops/filter/test_combined_logical_filter.py b/tests/ops/filter/test_combined_logical_filter.py
new file mode 100644
index 00000000000..9e86a00f85a
--- /dev/null
+++ b/tests/ops/filter/test_combined_logical_filter.py
@@ -0,0 +1,177 @@
+import unittest
+
+from data_juicer.core.data import NestedDataset as Dataset
+from data_juicer.ops.filter.combined_logical_filter import CombinedLogicalFilter
+from data_juicer.utils.constant import Fields
+from data_juicer.utils.unittest_utils import DataJuicerTestCaseBase
+
+
+class CombinedLogicalFilterTest(DataJuicerTestCaseBase):
+
+    def _run_combined_logical_filter(self, dataset: Dataset, target_list, op):
+        if Fields.stats not in dataset.features:
+            dataset = dataset.add_column(name=Fields.stats, column=[{}] * dataset.num_rows)
+        dataset = dataset.map(op.compute_stats, batch_size=3)
+        dataset = dataset.filter(op.process, batch_size=2)
+        dataset = dataset.select_columns(column_names=['text'])
+        res_list = dataset.to_list()
+        self.assertEqual(res_list, target_list)
+
+    def test_and_operation(self):
+        """Test AND operation: sample must satisfy both filters."""
+        ds_list = [
+            {'text': 'short'},  # Too short, filtered by first filter
+            {'text': 'This is a medium length text that should pass both filters'},  # Passes both
+            {'text': 'This is a very long text that exceeds the maximum length limit and should be filtered'},  # Too long, filtered by first filter
+        ]
+        target_list = [
+            {'text': 'This is a medium length text that should pass both filters'}
+        ]
+        dataset = Dataset.from_list(ds_list)
+        op = CombinedLogicalFilter(
+            filter_ops=[
+                {"text_length_filter": {"min_len": 20, "max_len": 100}},
+                {"text_length_filter": {"min_len": 10, "max_len": 200}}
+            ],
+            logical_op="and"
+        )
+        self._run_combined_logical_filter(dataset, target_list, op)
+
+    def test_or_operation(self):
+        """Test OR operation: sample must satisfy at least one filter."""
+        ds_list = [
+            {'text': 'short'},  # Too short for first filter, but passes second
+            {'text': 'This is a medium length text'},  # Passes both
+            {'text': 'This is a very long text that exceeds the maximum length limit'},  # Too long for first, but passes second
+            {'text': 'x'},  # Too short for both, filtered
+        ]
+        target_list = [
+            {'text': 'short'},
+            {'text': 'This is a medium length text'},
+            {'text': 'This is a very long text that exceeds the maximum length limit'}
+        ]
+        dataset = Dataset.from_list(ds_list)
+        op = CombinedLogicalFilter(
+            filter_ops=[
+                {"text_length_filter": {"min_len": 20, "max_len": 50}},
+                {"text_length_filter": {"min_len": 5, "max_len": 200}}
+            ],
+            logical_op="or"
+        )
+        self._run_combined_logical_filter(dataset, target_list, op)
+
+    def test_single_filter(self):
+        """Test with a single filter (should work like a normal filter)."""
+        ds_list = [
+            {'text': 'short'},
+            {'text': 'This is a medium length text'},
+            {'text': 'This is a very long text that exceeds the maximum length limit'},
+        ]
+        target_list = [
+            {'text': 'This is a medium length text'}
+        ]
+        dataset = Dataset.from_list(ds_list)
+        op = CombinedLogicalFilter(
+            filter_ops=[
+                {"text_length_filter": {"min_len": 20, "max_len": 50}}
+            ],
+            logical_op="and"
+        )
+        self._run_combined_logical_filter(dataset, target_list, op)
+
+    def test_default_and(self):
+        """Test that default logical_op is 'and'."""
+        ds_list = [
+            {'text': 'short'},
+            {'text': 'This is a medium length text'},
+        ]
+        target_list = [
+            {'text': 'This is a medium length text'}
+        ]
+        dataset = Dataset.from_list(ds_list)
+        # Don't specify logical_op, should default to "and"
+        op = CombinedLogicalFilter(
+            filter_ops=[
+                {"text_length_filter": {"min_len": 20, "max_len": 50}}
+            ]
+        )
+        self._run_combined_logical_filter(dataset, target_list, op)
+
+    def test_empty_filter_ops(self):
+        """Test that empty filter_ops raises ValueError."""
+        with self.assertRaises(ValueError) as context:
+            CombinedLogicalFilter(filter_ops=[])
+        self.assertIn("cannot be empty", str(context.exception))
+
+    def test_invalid_logical_op(self):
+        """Test that invalid logical_op raises ValueError."""
+        with self.assertRaises(ValueError) as context:
+            CombinedLogicalFilter(
+                filter_ops=[{"text_length_filter": {"min_len": 10}}],
+                logical_op="xor"
+            )
+        self.assertIn("must be 'and' or 'or'", str(context.exception))
+
+    def test_case_insensitive_logical_op(self):
+        """Test that logical_op is case-insensitive."""
+        ds_list = [
+            {'text': 'short'},
+            {'text': 'This is a medium length text'},
+        ]
+        target_list = [
+            {'text': 'This is a medium length text'}
+        ]
+        dataset = Dataset.from_list(ds_list)
+        # Test uppercase
+        op = CombinedLogicalFilter(
+            filter_ops=[{"text_length_filter": {"min_len": 20, "max_len": 50}}],
+            logical_op="AND"
+        )
+        self._run_combined_logical_filter(dataset, target_list, op)
+
+    def test_multiple_filters_and(self):
+        """Test AND operation with multiple filters."""
+        ds_list = [
+            {'text': 'short'},  # Fails length check
+            {'text': 'This is a medium length text'},  # Passes all
+            {'text': 'This is a very long text that exceeds the maximum length limit'},  # Fails length check
+        ]
+        target_list = [
+            {'text': 'This is a medium length text'}
+        ]
+        dataset = Dataset.from_list(ds_list)
+        op = CombinedLogicalFilter(
+            filter_ops=[
+                {"text_length_filter": {"min_len": 20, "max_len": 100}},
+                {"text_length_filter": {"min_len": 10, "max_len": 200}},
+                {"text_length_filter": {"min_len": 15, "max_len": 150}}
+            ],
+            logical_op="and"
+        )
+        self._run_combined_logical_filter(dataset, target_list, op)
+
+    def test_multiple_filters_or(self):
+        """Test OR operation with multiple filters."""
+        ds_list = [
+            {'text': 'short'},  # Fails all
+            {'text': 'This is a medium length text'},  # Passes all
+            {'text': 'This is a very long text that exceeds the maximum length limit'},  # Passes some
+        ]
+        target_list = [
+            {'text': 'This is a medium length text'},
+            {'text': 'This is a very long text that exceeds the maximum length limit'}
+        ]
+        dataset = Dataset.from_list(ds_list)
+        op = CombinedLogicalFilter(
+            filter_ops=[
+                {"text_length_filter": {"min_len": 20, "max_len": 50}},  # medium passes, long fails
+                {"text_length_filter": {"min_len": 5, "max_len": 200}},  # all pass
+                {"text_length_filter": {"min_len": 100, "max_len": 300}}  # only long passes
+            ],
+            logical_op="or"
+        )
+        self._run_combined_logical_filter(dataset, target_list, op)
+
+
+if __name__ == '__main__':
+    unittest.main()