🧮 Optimizers

evoagentx.optimizers

SEWOptimizer

SEWOptimizer(**kwargs)

Bases: Optimizer

Source code in evoagentx/core/module.py
def __init__(self, **kwargs):
    """
    Initializes a BaseModule instance.

    Args:
        **kwargs (Any): Keyword arguments used to initialize the instance

    Raises:
        ValidationError: When parameter validation fails
        Exception: When other errors occur during initialization
    """

    try:
        for field_name, _ in type(self).model_fields.items():
            field_value = kwargs.get(field_name, None)
            if field_value:
                kwargs[field_name] = self._process_data(field_value)
            # if field_value and isinstance(field_value, dict) and "class_name" in field_value:
            #     class_name = field_value.get("class_name")
            #     sub_cls = MODULE_REGISTRY.get_module(cls_name=class_name)
            #     kwargs[field_name] = sub_cls._create_instance(field_value)
        super().__init__(**kwargs) 
        self.init_module()
    except (ValidationError, Exception) as e:
        exception_handler = callback_manager.get_callback("exception_buffer")
        if exception_handler is None:
            error_message = get_base_module_init_error_message(
                cls=self.__class__, 
                data=kwargs, 
                errors=e
            )
            logger.error(error_message)
            raise
        else:
            exception_handler.add(e)

step

step(**kwargs) -> Union[SequentialWorkFlowGraph, ActionGraph]

Take a step of optimization and return the optimized graph.

Source code in evoagentx/optimizers/sew_optimizer.py
def step(self, **kwargs) -> Union[SequentialWorkFlowGraph, ActionGraph]:
    """
    Take a step of optimization and return the optimized graph.
    """
    graph = self._select_graph_with_highest_score(return_metrics=False)
    if isinstance(graph, SequentialWorkFlowGraph):
        new_graph = self._workflow_graph_step(graph)
    elif isinstance(graph, ActionGraph):
        new_graph = self._action_graph_step(graph)
    else:
        raise ValueError(f"Invalid graph type: {type(graph)}. The graph should be an instance of `WorkFlowGraph` or `ActionGraph`.")
    return new_graph
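
The loop below is a minimal sketch of driving the optimizer by hand with step(); it assumes an already constructed SEWOptimizer named optimizer and a Benchmark named benchmark (the library's own optimize routine may manage this loop and snapshot logging internally).

# Hypothetical manual optimization loop (sketch).
for i in range(5):
    # step() picks the best graph seen so far and returns an optimized variant.
    new_graph = optimizer.step()
    # Score the candidate on the dev split using the documented evaluate() signature.
    metrics = optimizer.evaluate(dataset=benchmark, eval_mode="dev", graph=new_graph)
    print(f"round {i + 1}: {metrics}")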

evaluate

evaluate(dataset: Benchmark, eval_mode: str = 'test', graph: Optional[Union[SequentialWorkFlowGraph, ActionGraph]] = None, indices: Optional[List[int]] = None, sample_k: Optional[int] = None, **kwargs) -> dict

Evaluate the workflow. If graph is provided, use the provided graph for evaluation. Otherwise, use the graph in the optimizer.

Parameters:

    dataset (Benchmark, required): The dataset to evaluate the workflow on.
    eval_mode (str, default "test"): The evaluation mode. Choices: ["test", "dev", "train"].
    graph (Union[SequentialWorkFlowGraph, ActionGraph], optional): The graph to evaluate. If not provided, use the graph in the optimizer. Defaults to None.
    indices (List[int], optional): The indices of the data to evaluate the workflow on. Defaults to None.
    sample_k (int, optional): The number of examples to evaluate the workflow on. If provided, a random sample of size sample_k will be used. Defaults to None.

Returns:

    dict: The metrics of the workflow evaluation.

Source code in evoagentx/optimizers/sew_optimizer.py
def evaluate(
    self, 
    dataset: Benchmark, 
    eval_mode: str = "test", 
    graph: Optional[Union[SequentialWorkFlowGraph, ActionGraph]] = None,
    indices: Optional[List[int]] = None,
    sample_k: Optional[int] = None,
    **kwargs
) -> dict:
    """
    Evaluate the workflow. If `graph` is provided, use the provided graph for evaluation. Otherwise, use the graph in the optimizer. 

    Args:
        dataset (Benchmark): The dataset to evaluate the workflow on.
        eval_mode (str): The evaluation mode. Choices: ["test", "dev", "train"].
        graph (Union[WorkFlowGraph, ActionGraph], optional): The graph to evaluate. If not provided, use the graph in the optimizer.
        indices (List[int], optional): The indices of the data to evaluate the workflow on.
        sample_k (int, optional): The number of data to evaluate the workflow on. If provided, a random sample of size `sample_k` will be used.

    Returns:
        dict: The metrics of the workflow evaluation.
    """
    graph = graph if graph is not None else self.graph
    metrics_list = []
    for i in range(self.eval_rounds):
        eval_info = [
            f"[{type(graph).__name__}]", 
            f"Evaluation round {i+1}/{self.eval_rounds}", 
            f"Mode: {eval_mode}"
        ]
        if indices is not None:
            eval_info.append(f"Indices: {len(indices)} samples")
        if sample_k is not None:
            eval_info.append(f"Sample size: {sample_k}")
        logger.info(" | ".join(eval_info))
        metrics = self.evaluator.evaluate(
            graph=graph, 
            benchmark=dataset, 
            eval_mode=eval_mode, 
            indices=indices, 
            sample_k=sample_k,
            **kwargs
        )
        metrics_list.append(metrics)
    avg_metrics = self.evaluator._calculate_average_score(metrics_list)

    return avg_metrics
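
For example, the calls below (a sketch; optimizer, benchmark, and candidate_graph are assumed to exist) evaluate the current graph on 50 randomly sampled dev examples and a specific candidate graph on fixed test indices; with eval_rounds > 1 the returned metrics are averaged over the rounds.

# Evaluate the optimizer's current graph on the dev split with 50 sampled examples.
dev_metrics = optimizer.evaluate(dataset=benchmark, eval_mode="dev", sample_k=50)

# Evaluate a specific candidate graph on fixed test indices.
test_metrics = optimizer.evaluate(
    dataset=benchmark,
    eval_mode="test",
    graph=candidate_graph,  # a SequentialWorkFlowGraph or ActionGraph
    indices=[0, 1, 2, 3],
)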

save

save(path: str, ignore: List[str] = [])

Save the (optimized) workflow graph to a file.

Parameters:

    path (str, required): The path to save the workflow graph.
    ignore (List[str], default []): The keys to ignore when saving the workflow graph.

Source code in evoagentx/optimizers/sew_optimizer.py
def save(self, path: str, ignore: List[str] = []):
    """
    Save the (optimized) workflow graph to a file. 

    Args:
        path (str): The path to save the workflow graph.
        ignore (List[str]): The keys to ignore when saving the workflow graph.
    """
    self.graph.save_module(path, ignore=ignore)
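
A short usage sketch; the key listed in ignore is illustrative, not a documented field name.

# Persist the optimized graph; keys in `ignore` are dropped from the saved module.
optimizer.save("output/sew_optimized_graph.json", ignore=["llm_config"])  # "llm_config" is a hypothetical key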

AFlowOptimizer

AFlowOptimizer(**kwargs)

Bases: BaseModule

AFlow Optimizer for workflow optimization.

This optimizer iteratively improves workflows through multiple rounds of optimization using large language models. It evaluates workflow performance, identifies improvement opportunities, and applies optimizations based on experience and convergence metrics.

Attributes:

    question_type (str): Type of task to optimize for (e.g., qa, match, code).
    graph_path (str): Path to the workflow graph directory (must contain graph.py and prompt.py).
    optimized_path (str): Path to save optimized workflows (defaults to graph_path).
    initial_round (int): Starting round number for optimization.
    optimizer_llm (BaseLLM): LLM used for generating optimizations.
    executor_llm (BaseLLM): LLM used for executing the workflow.
    operators (List[str]): List of operators available for optimization.
    sample (int): Number of rounds to sample from for optimization.
    max_rounds (int): Maximum number of optimization rounds to perform.
    validation_rounds (int): Number of validation runs per optimization round.
    eval_rounds (int): Number of evaluation runs for test mode.
    check_convergence (bool): Whether to check for optimization convergence.
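
A construction sketch based on the attributes above; it assumes they can be passed as keyword arguments through BaseModule's generic **kwargs constructor, and all concrete values are placeholders.

# Hypothetical construction; attribute names follow the list above, values are placeholders.
optimizer = AFlowOptimizer(
    question_type="qa",
    graph_path="examples/aflow/hotpotqa",             # must contain graph.py and prompt.py
    optimized_path="examples/aflow/hotpotqa_optimized",
    optimizer_llm=optimizer_llm,                      # a BaseLLM used to propose optimizations
    executor_llm=executor_llm,                        # a BaseLLM used to run the workflow
    operators=["Custom", "AnswerGenerate"],           # operator names are illustrative
    max_rounds=10,
    validation_rounds=3,
)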

Source code in evoagentx/core/module.py
def __init__(self, **kwargs):
    """
    Initializes a BaseModule instance.

    Args:
        **kwargs (Any): Keyword arguments used to initialize the instance

    Raises:
        ValidationError: When parameter validation fails
        Exception: When other errors occur during initialization
    """

    try:
        for field_name, _ in type(self).model_fields.items():
            field_value = kwargs.get(field_name, None)
            if field_value:
                kwargs[field_name] = self._process_data(field_value)
            # if field_value and isinstance(field_value, dict) and "class_name" in field_value:
            #     class_name = field_value.get("class_name")
            #     sub_cls = MODULE_REGISTRY.get_module(cls_name=class_name)
            #     kwargs[field_name] = sub_cls._create_instance(field_value)
        super().__init__(**kwargs) 
        self.init_module()
    except (ValidationError, Exception) as e:
        exception_handler = callback_manager.get_callback("exception_buffer")
        if exception_handler is None:
            error_message = get_base_module_init_error_message(
                cls=self.__class__, 
                data=kwargs, 
                errors=e
            )
            logger.error(error_message)
            raise
        else:
            exception_handler.add(e)

optimize

optimize(benchmark: Benchmark)

Run the optimization process on the workflow.

Performs multiple rounds of optimization, evaluating each round against the benchmark and checking for convergence. Continues until convergence is detected or the maximum number of rounds is reached.

Parameters:

    benchmark (Benchmark, required): The benchmark to evaluate the workflow against.

Source code in evoagentx/optimizers/aflow_optimizer.py
def optimize(self, benchmark: Benchmark):
    """Run the optimization process on the workflow.

    Performs multiple rounds of optimization, evaluating each round against
    the benchmark and checking for convergence. Continues until convergence
    is detected or the maximum number of rounds is reached.

    Args:
        benchmark: The benchmark to evaluate the workflow against
    """
    self.benchmark = benchmark
    for _ in range(self.max_rounds):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        score = loop.run_until_complete(self._execute_with_retry(self._optimize_graph))
        self.round += 1
        logger.info(f"Score for round {self.round}: {score}")
        if self._check_convergence():
            break
        if self.round >= self.max_rounds:
            logger.info(f"Max rounds reached: {self.max_rounds}, stopping optimization.")
            break
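
A usage sketch, assuming the optimizer above and a Benchmark named benchmark:

# Run up to max_rounds optimization rounds against the benchmark,
# stopping early if convergence is detected.
optimizer.optimize(benchmark)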

test

test(benchmark: Benchmark, test_rounds: List[int] = None)

Run the test evaluation on optimized workflows.

Evaluates specified rounds (or the best round if none specified) against the benchmark multiple times and logs the results.

Parameters:

    benchmark (Benchmark, required): The benchmark to evaluate against.
    test_rounds (List[int], default None): Specific round numbers to test, or None to use the best round.

Source code in evoagentx/optimizers/aflow_optimizer.py
def test(self, benchmark: Benchmark, test_rounds: List[int] = None):
    """Run the test evaluation on optimized workflows.

    Evaluates specified rounds (or the best round if none specified) against
    the benchmark multiple times and logs the results.

    Args:
        benchmark: The benchmark to evaluate against
        test_rounds: Specific round numbers to test, or None to use the best round
    """
    self.benchmark = benchmark
    if test_rounds is None:
        best_round = self._load_best_round()
        logger.info(f"No test rounds provided, using best round: {best_round}")
        test_rounds = [best_round]
    for _ in tqdm(range(self.eval_rounds)):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self._run_test(test_rounds))
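
For example (a sketch; the round numbers are placeholders):

# Test the best round found during optimization ...
optimizer.test(benchmark)

# ... or test specific optimization rounds.
optimizer.test(benchmark, test_rounds=[3, 5])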

TextGradOptimizer

TextGradOptimizer(**kwargs)

Bases: BaseModule

Uses TextGrad to optimize agents' system prompts and instructions in a multi-agent workflow. For more information on TextGrad, see https://github.com/zou-group/textgrad.

Source code in evoagentx/core/module.py
def __init__(self, **kwargs):
    """
    Initializes a BaseModule instance.

    Args:
        **kwargs (Any): Keyword arguments used to initialize the instance

    Raises:
        ValidationError: When parameter validation fails
        Exception: When other errors occur during initialization
    """

    try:
        for field_name, _ in type(self).model_fields.items():
            field_value = kwargs.get(field_name, None)
            if field_value:
                kwargs[field_name] = self._process_data(field_value)
            # if field_value and isinstance(field_value, dict) and "class_name" in field_value:
            #     class_name = field_value.get("class_name")
            #     sub_cls = MODULE_REGISTRY.get_module(cls_name=class_name)
            #     kwargs[field_name] = sub_cls._create_instance(field_value)
        super().__init__(**kwargs) 
        self.init_module()
    except (ValidationError, Exception) as e:
        exception_handler = callback_manager.get_callback("exception_buffer")
        if exception_handler is None:
            error_message = get_base_module_init_error_message(
                cls=self.__class__, 
                data=kwargs, 
                errors=e
            )
            logger.error(error_message)
            raise
        else:
            exception_handler.add(e)

optimize

optimize(dataset: Benchmark, use_answers: bool = True, seed: Optional[int] = None) -> None

Optimizes self.graph using dataset.

Parameters:

    dataset (Benchmark, required): The dataset to use for optimization.
    use_answers (bool, default True): Whether to use the answers (labels) in the training set for optimization. If False, the dataset's training set does not need to have answers. If eval_every_n_steps is set to None, the workflow can be optimized without any labeled data.
    seed (Optional[int], default None): The random seed to use for shuffling the data.

Source code in evoagentx/optimizers/textgrad_optimizer.py
def optimize(self, dataset: Benchmark, use_answers: bool = True, seed: Optional[int] = None) -> None:
    """Optimizes self.graph using `dataset`.

    Args:
        dataset (Benchmark): The dataset to use for optimization.
        use_answers (bool): Whether to use the answers (labels) in the training set for optimization.
            If False, `dataset`'s training set does not need to have answers.
            If `eval_every_n_steps` is set to None, we can optimize the workflow without any labeled data.
        seed (Optional[int]): The random seed to use for shuffling the data.
    """
    self._init_textgrad(dataset, use_answers)

    def iterator() -> Iterator[Tuple[List[dict[str, str]],  Optional[List[Union[str, dict[str, str]]]]]]:
        epoch = 0
        while True:
            # Shuffle train data every epoch
            effective_seed = seed + epoch if seed is not None else None
            train_data = dataset.get_train_data(sample_k=len(dataset._train_data), seed=effective_seed)
            for i in range(0, len(train_data), self.batch_size):
                batch = train_data[i:i + self.batch_size]
                inputs = [self.evaluator.collate_func(x) for x in batch]
                if use_answers:
                    labels = dataset.get_labels(batch)
                else:
                    labels = None
                yield inputs, labels
            epoch += 1

    data_iterator = iterator()

    for step in tqdm(range(self.max_steps)):
        inputs, labels = next(data_iterator)
        self.step(inputs, labels, dataset, use_answers)

        if self.eval_every_n_steps is not None and (step + 1) % self.eval_every_n_steps == 0:
            logger.info(f"Evaluating the workflow at step {step+1} ...")
            with suppress_logger_info():
                metrics = self.evaluate(dataset, **self.eval_config)
            self.log_snapshot(self.graph, metrics)
            logger.info(f"Step {step+1} metrics: {metrics}")

            # If rollback is enabled, keep track of the best snapshot
            if self.rollback:
                if len(self._snapshot) == 1:
                    best_snapshot = self._snapshot[-1]
                    best_average_score = np.mean(list(metrics.values()))
                else:
                    current_average_score = np.mean(list(metrics.values()))

                    if current_average_score >= best_average_score:
                        # If the current average score is better than the best average score, update the best snapshot
                        best_snapshot = self._snapshot[-1]
                        best_average_score = current_average_score
                    else:
                        # If the current average score is worse than the best average score, roll back to the best snapshot
                        logger.info(f"Metrics are worse than the best snapshot which has {best_snapshot['metrics']}. Rolling back to the best snapshot.")
                        best_graph = WorkFlowGraph.from_dict(best_snapshot["graph"])
                        self.graph = best_graph
                        self._create_textgrad_agents()

        if self.save_interval is not None and (step + 1) % self.save_interval == 0:
            logger.info(f"Saving the workflow at step {step+1} ...")
            self.save(os.path.join(self.save_path, f"{dataset.name}_textgrad_step_{step+1}.json"))

    logger.info(f"Reached the maximum number of steps {self.max_steps}. Optimization has finished.")
    self.save(os.path.join(self.save_path, f"{dataset.name}_textgrad_final.json"))

    # Saves the best graph
    if len(self._snapshot) > 0:
        best_graph = self._select_graph_with_highest_score()
        self.save(os.path.join(self.save_path, f"{dataset.name}_textgrad_best.json"), graph=best_graph)

step

step(inputs: list[dict[str, str]], labels: Optional[list[Union[str, dict[str, str]]]], dataset: Benchmark, use_answers: bool = True) -> None

Performs one optimization step using a batch of data.

Source code in evoagentx/optimizers/textgrad_optimizer.py
def step(
    self, 
    inputs: list[dict[str, str]], 
    labels: Optional[list[Union[str, dict[str, str]]]], 
    dataset: Benchmark, 
    use_answers: bool = True
) -> None:
    """Performs one optimization step using a batch of data."""

    losses = []
    logger.info("Executing workflow...")

    if use_answers:
        if labels is None:
            raise ValueError("Labels must be provided if `use_answers` is True.")

        for input, label in zip(inputs, labels, strict=True):
            output = self.forward(input)
            if isinstance(label, str):
                label = Variable(label, requires_grad=False, role_description="correct answer for the query")
            elif isinstance(label, dict):
                if not isinstance(dataset, CodingBenchmark):
                    raise ValueError("Label must be a string for non-coding benchmarks.")
                end_node_name = self.graph.find_end_nodes()[0]
                end_node = self.graph.get_node(end_node_name)
                output_name = end_node.outputs[0].name
                code = output.parsed_outputs[output_name]
                label = self._format_code_label(code, label, dataset)
                label = Variable(label, requires_grad=False, role_description="the task, the test result, and the correct code")
            loss = self.loss_fn([output, label])
            losses.append(loss)
    else:
        for input in inputs:
            output = self.forward(input)
            loss = self.loss_fn(output)
            losses.append(loss)

    total_loss = tg.sum(losses)
    logger.info("Computing gradients...")
    total_loss.backward(self.optimizer_engine)
    logger.info("Updating agents...")
    self.textgrad_optimizer.step()
    self.textgrad_optimizer.zero_grad()
    self._update_workflow_graph()
    logger.info("Agents updated")

forward

forward(inputs: dict[str, str]) -> Variable

Returns the final output from the workflow.

Source code in evoagentx/optimizers/textgrad_optimizer.py
def forward(self, inputs: dict[str, str]) -> Variable:
    """Returns the final output from the workflow."""
    self._visited_nodes = set()
    end_node = self.graph.find_end_nodes()[0]
    input_variables = self._initial_inputs_to_variables(inputs)
    output = self._compute_node(end_node, input_variables)
    return output

evaluate

evaluate(dataset: Benchmark, eval_mode: str = 'dev', graph: Optional[WorkFlowGraph] = None, indices: Optional[List[int]] = None, sample_k: Optional[int] = None, **kwargs) -> dict

Evaluate the workflow. If graph is provided, use the provided graph for evaluation. Otherwise, use the graph in the optimizer.

Parameters:

    dataset (Benchmark, required): The dataset to evaluate the workflow on.
    eval_mode (str, default "dev"): The evaluation mode. Choices: ["test", "dev", "train"].
    graph (WorkFlowGraph, optional): The graph to evaluate. If not provided, use the graph in the optimizer. Defaults to None.
    indices (List[int], optional): The indices of the data to evaluate the workflow on. Defaults to None.
    sample_k (int, optional): The number of examples to evaluate the workflow on. If provided, a random sample of size sample_k will be used. Defaults to None.

Returns:

    dict: The metrics of the workflow evaluation.

Source code in evoagentx/optimizers/textgrad_optimizer.py
def evaluate(
    self, 
    dataset: Benchmark, 
    eval_mode: str = "dev", 
    graph: Optional[WorkFlowGraph] = None,
    indices: Optional[List[int]] = None,
    sample_k: Optional[int] = None,
    **kwargs
) -> dict:
    """Evaluate the workflow. If `graph` is provided, use the provided graph for evaluation. Otherwise, use the graph in the optimizer. 

    Args:
        dataset (Benchmark): The dataset to evaluate the workflow on.
        eval_mode (str): The evaluation mode. Choices: ["test", "dev", "train"].
        graph (WorkFlowGraph, optional): The graph to evaluate. If not provided, use the graph in the optimizer.
        indices (List[int], optional): The indices of the data to evaluate the workflow on.
        sample_k (int, optional): The number of data to evaluate the workflow on. If provided, a random sample of size `sample_k` will be used.

    Returns:
        dict: The metrics of the workflow evaluation.
    """
    if graph is None:
        graph = self.graph

    metrics_list = []
    for i in range(self.eval_rounds):
        eval_info = [
            f"[{type(graph).__name__}]", 
            f"Evaluation round {i+1}/{self.eval_rounds}", 
            f"Mode: {eval_mode}"
        ]
        if indices is not None:
            eval_info.append(f"Indices: {len(indices)} samples")
        if sample_k is not None:
            eval_info.append(f"Sample size: {sample_k}")
        logger.info(" | ".join(eval_info))
        metrics = self.evaluator.evaluate(
            graph=graph, 
            benchmark=dataset, 
            eval_mode=eval_mode, 
            indices=indices, 
            sample_k=sample_k,
            update_agents=True, 
            **kwargs
        )
        metrics_list.append(metrics)
    avg_metrics = self.evaluator._calculate_average_score(metrics_list)

    return avg_metrics

save

save(path: str, graph: Optional[WorkFlowGraph] = None, ignore: List[str] = []) -> None

Save the workflow graph containing the optimized prompts to a file.

Parameters:

    path (str, required): The path to save the workflow graph.
    graph (WorkFlowGraph, optional): The graph to save. If not provided, use the graph in the optimizer. Defaults to None.
    ignore (List[str], default []): The keys to ignore when saving the workflow graph.

Source code in evoagentx/optimizers/textgrad_optimizer.py
def save(self, path: str, graph: Optional[WorkFlowGraph] = None, ignore: List[str] = []) -> None:
    """Save the workflow graph containing the optimized prompts to a file. 

    Args:
        path (str): The path to save the workflow graph.
        graph (WorkFlowGraph, optional): The graph to save. If not provided, use the graph in the optimizer.
        ignore (List[str]): The keys to ignore when saving the workflow graph.
    """
    if graph is None:
        graph = self.graph
    graph.save_module(path, ignore=ignore)

log_snapshot

log_snapshot(graph: WorkFlowGraph, metrics: dict) -> None

Log the snapshot of the workflow.

Source code in evoagentx/optimizers/textgrad_optimizer.py
def log_snapshot(self, graph: WorkFlowGraph, metrics: dict) -> None:
    """Log the snapshot of the workflow."""
    self._snapshot.append(
        {
            "index": len(self._snapshot),
            "graph": deepcopy(graph.get_config()),
            "metrics": metrics,
        }
    )

restore_best_graph

restore_best_graph() -> None

Restore the best graph from the snapshot and set it to self.graph.

Source code in evoagentx/optimizers/textgrad_optimizer.py
def restore_best_graph(self) -> None:
    """Restore the best graph from the snapshot and set it to `self.graph`."""
    if len(self._snapshot) == 0:
        logger.info("No snapshot found. No graph to restore.")
        return

    best_graph, best_metrics = self._select_graph_with_highest_score(return_metrics=True)
    self.graph = best_graph
    logger.info(f"Restored the best graph from snapshot with metrics {best_metrics}")

MiproOptimizer

MiproOptimizer(registry: ParamRegistry, program: Callable, optimizer_llm: BaseLLM, evaluator: Optional[Callable] = None, eval_rounds: Optional[int] = 1, metric_threshold: Optional[float] = None, max_bootstrapped_demos: int = 4, max_labeled_demos: int = 4, auto: Optional[Literal['light', 'medium', 'heavy']] = 'medium', max_steps: int = None, num_candidates: Optional[int] = None, num_threads: Optional[int] = None, max_errors: int = 10, seed: int = 9, init_temperature: float = 0.5, track_stats: bool = True, save_path: Optional[str] = None, minibatch: bool = True, minibatch_size: int = 35, minibatch_full_eval_steps: int = 5, program_aware_proposer: bool = True, data_aware_proposer: bool = True, view_data_batch_size: int = 10, tip_aware_proposer: bool = True, fewshot_aware_proposer: bool = True, requires_permission_to_run: bool = False, provide_traceback: Optional[bool] = None, verbose: bool = False, **kwargs)

Bases: BaseOptimizer, MIPROv2

Base MiproOptimizer class that supports plug-and-play usage.

Parameters:

    registry (ParamRegistry, required): A ParamRegistry object that contains the parameters to optimize.
    program (Callable, required): The program to optimize. Must be a callable object with save(path) and load(path) methods.
    optimizer_llm (BaseLLM, required): The language model to use for optimization.
    evaluator (Optional[Callable], default None): A function that evaluates the performance of the program. Required to have a __call__(program, evalset, **kwargs) -> float method that receives a program and a list of examples from a benchmark's train/dev/test set and returns a float score. Must also have a metric(example, prediction) -> float method that evaluates a single example. If not provided, a default evaluator is constructed from the benchmark's evaluate method.
    eval_rounds (Optional[int], default 1): Number of rounds to evaluate the program.
    metric_threshold (Optional[float], default None): Threshold for the metric score. If provided, only examples with scores above this threshold will be used as demonstrations; otherwise, examples with scores above 0 will be used.
    max_bootstrapped_demos (int, default 4): Maximum number of bootstrapped demonstrations to use.
    max_labeled_demos (int, default 4): Maximum number of labeled demonstrations to use.
    auto (Optional[Literal["light", "medium", "heavy"]], default "medium"): Automatic configuration mode. If set, overrides num_candidates and max_steps. "light": n=6, val_size=100; "medium": n=12, val_size=300; "heavy": n=18, val_size=1000.
    max_steps (int, default None): Maximum number of optimization steps. Required if auto is None.
    num_candidates (Optional[int], default None): Number of candidates to generate for each optimization step. Required if auto is None.
    num_threads (Optional[int], default None): Number of threads to use for parallel evaluation. If None, a single thread is used. Only used if evaluator is not provided.
    max_errors (int, default 10): Maximum number of errors allowed during evaluation before stopping.
    seed (int, default 9): Random seed for reproducibility.
    init_temperature (float, default 0.5): Initial temperature for instruction generation.
    track_stats (bool, default True): Whether to track optimization statistics.
    save_path (Optional[str], default None): Path to save optimization results. If None, results will not be saved.
    minibatch (bool, default True): Whether to use minibatch evaluation during optimization.
    minibatch_size (int, default 35): Size of the minibatch for evaluation.
    minibatch_full_eval_steps (int, default 5): Number of minibatch steps between full evaluations.
    program_aware_proposer (bool, default True): Whether to use the program-aware instruction proposer.
    data_aware_proposer (bool, default True): Whether to use the data-aware instruction proposer.
    view_data_batch_size (int, default 10): Batch size for viewing data during instruction proposal.
    tip_aware_proposer (bool, default True): Whether to use the tip-aware instruction proposer.
    fewshot_aware_proposer (bool, default True): Whether to use the fewshot-aware instruction proposer.
    requires_permission_to_run (bool, default False): Whether to require user permission before running optimization.
    provide_traceback (Optional[bool], default None): Whether to provide a traceback for evaluation errors. If None, the default setting is used.
    **kwargs: Additional keyword arguments to pass to the evaluator.

Raises:

    TypeError: If program is not callable or the evaluator does not return a float.
    ValueError: If program does not have the required methods (save and load) or if the evaluator does not have the required methods.

Source code in evoagentx/optimizers/mipro_optimizer.py
def __init__(
    self,
    registry: ParamRegistry,
    program: Callable,
    optimizer_llm: BaseLLM,
    evaluator: Optional[Callable] = None,
    eval_rounds: Optional[int] = 1, 
    metric_threshold: Optional[float] = None,
    max_bootstrapped_demos: int = 4, 
    max_labeled_demos: int = 4, 
    auto: Optional[Literal["light", "medium", "heavy"]] = "medium", 
    max_steps: int = None, 
    num_candidates: Optional[int] = None, 
    num_threads: Optional[int] = None, 
    max_errors: int = 10, 
    seed: int = 9, 
    init_temperature: float = 0.5, 
    track_stats: bool = True, 
    save_path: Optional[str] = None,  
    minibatch: bool = True, 
    minibatch_size: int = 35, 
    minibatch_full_eval_steps: int = 5, 
    program_aware_proposer: bool = True,
    data_aware_proposer: bool = True,
    view_data_batch_size: int = 10,
    tip_aware_proposer: bool = True,
    fewshot_aware_proposer: bool = True,
    requires_permission_to_run: bool = False,
    provide_traceback: Optional[bool] = None,
    verbose: bool = False, 
    **kwargs
):
    """
    Base MiproOptimizer class that supports plug-and-play usage. 

    Args: 
        registry (ParamRegistry): a ParamRegistry object that contains the parameters to optimize. 
        program (Callable): a program to optimize. Must be a callable object with save(path) and load(path) methods.
        optimizer_llm (BaseLLM): a language model to use for optimization. 
        evaluator (Optional[Callable]): a function that evaluates the performance of the program. 
            Required to have a `__call__(program, evalset, *kwargs) -> float` method that receives a program and a list of 
            examples from a benchmark's train/dev/test set and return a float score. Must also have a `metric(example, prediction) -> float` 
            method that evaluates a single example. If not provided, will construct a default evaluator using the benchmark's evaluate method.
        eval_rounds (Optional[int]): number of rounds to evaluate the program. Defaults to 1. 
        metric_threshold (Optional[float]): threshold for the metric score. If provided, only examples with scores above this threshold will be used as demonstrations. 
            If not provided, examples with scores above 0 will be used as demonstrations. 
        max_bootstrapped_demos (int): maximum number of bootstrapped demonstrations to use. Defaults to 4.
        max_labeled_demos (int): maximum number of labeled demonstrations to use. Defaults to 4.
        auto (Optional[Literal["light", "medium", "heavy"]]): automatic configuration mode. If set, will override num_candidates and max_steps. 
            "light": n=6, val_size=100; "medium": n=12, val_size=300; "heavy": n=18, val_size=1000. Defaults to "medium".
        max_steps (int): maximum number of optimization steps. Required if auto is None.
        num_candidates (Optional[int]): number of candidates to generate for each optimization step. Required if auto is None.
        num_threads (Optional[int]): number of threads to use for parallel evaluation. If None, will use single thread. Only used if evaluator is not provided. 
        max_errors (int): maximum number of errors allowed during evaluation before stopping. Defaults to 10.
        seed (int): random seed for reproducibility. Defaults to 9.
        init_temperature (float): initial temperature for instruction generation. Defaults to 0.5.
        track_stats (bool): whether to track optimization statistics. Defaults to True.
        save_path (Optional[str]): path to save optimization results. If None, results will not be saved.
        minibatch (bool): whether to use minibatch evaluation during optimization. Defaults to True.
        minibatch_size (int): size of minibatch for evaluation. Defaults to 35.
        minibatch_full_eval_steps (int): number of minibatch steps between full evaluations. Defaults to 5.
        program_aware_proposer (bool): whether to use program-aware instruction proposer. Defaults to True.
        data_aware_proposer (bool): whether to use data-aware instruction proposer. Defaults to True.
        view_data_batch_size (int): batch size for viewing data during instruction proposal. Defaults to 10.
        tip_aware_proposer (bool): whether to use tip-aware instruction proposer. Defaults to True.
        fewshot_aware_proposer (bool): whether to use fewshot-aware instruction proposer. Defaults to True.
        requires_permission_to_run (bool): whether to require user permission before running optimization. Defaults to False.
        provide_traceback (Optional[bool]): whether to provide traceback for evaluation errors. If None, will use default setting.
        **kwargs: additional keyword arguments to pass to the evaluator.

    Raises:
        TypeError: If program is not callable or evaluator doesn't return float
        ValueError: If program doesn't have required methods (save and load) or if evaluator doesn't have required methods
    """

    # initialize base optimizer
    BaseOptimizer.__init__(self, registry=registry, program=program, evaluator=evaluator)

    # convert the registry and program to dspy-compatible module
    self._validate_program(program=program)
    self.model = self._convert_to_dspy_module(registry, program)
    self.optimizer_llm = MiproLMWrapper(optimizer_llm)
    dspy.configure(lm=self.optimizer_llm)
    self.task_model = dspy.settings.lm 
    self.prompt_model = dspy.settings.lm 
    self.metric_threshold = metric_threshold
    self.metric_name = None 
    self.teacher_settings = {"use_teacher": True} 

    # Validate 'auto' parameter
    allowed_modes = {None, "light", "medium", "heavy"}
    if auto not in allowed_modes:
        raise ValueError(f"Invalid value for auto: {auto}. Must be one of {allowed_modes}.")
    self.auto = auto
    self.num_fewshot_candidates = num_candidates
    self.num_instruct_candidates = num_candidates
    self.num_candidates = num_candidates
    self.init_temperature = init_temperature
    self.max_bootstrapped_demos = max_bootstrapped_demos
    self.max_labeled_demos = max_labeled_demos
    self.max_steps = max_steps
    self.num_threads = num_threads
    self.max_errors = max_errors

    self.track_stats = track_stats
    self.eval_rounds = eval_rounds 
    self.save_path = save_path
    self.prompt_model_total_calls = 0
    self.total_calls = 0
    self.seed = seed
    self.rng = None

    self.minibatch = minibatch 
    self.minibatch_size = minibatch_size 
    self.minibatch_full_eval_steps = minibatch_full_eval_steps 
    self.program_aware_proposer = program_aware_proposer 
    self.data_aware_proposer = data_aware_proposer 
    self.view_data_batch_size = view_data_batch_size 
    self.tip_aware_proposer = tip_aware_proposer 
    self.fewshot_aware_proposer = fewshot_aware_proposer 
    self.requires_permission_to_run = requires_permission_to_run 
    self.provide_traceback = provide_traceback 
    self.verbose = verbose
    self.kwargs = kwargs 
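
A construction sketch using the documented signature; registry, program, and llm are assumed to be prepared elsewhere, and program must expose save(path) and load(path) methods.

# Hypothetical setup: `registry` is a ParamRegistry holding the parameters to
# tune, `program` a callable with save/load methods, and `llm` a BaseLLM.
optimizer = MiproOptimizer(
    registry=registry,
    program=program,
    optimizer_llm=llm,
    auto="light",               # when auto is set, num_candidates and max_steps must stay unset
    max_bootstrapped_demos=2,
    max_labeled_demos=2,
    save_path="output/mipro",   # optimization results are written here
)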

optimize

optimize(dataset: Benchmark, metric_name: Optional[str] = None, **kwargs)

Optimize the program using the Mipro algorithm.

Parameters:

    dataset (Benchmark, required): A Benchmark object that contains the training and validation data.
    metric_name (Optional[str], default None): The name of the metric to use for optimization. Only used when self.evaluator is not provided; in that case the evaluator is constructed from the benchmark's evaluate method (which returns a dictionary of scores) and the metric specified by metric_name is used for optimization. If not provided, the average of all scores returned by the evaluator is used. If self.evaluator is provided, this argument is ignored.
    **kwargs: Additional keyword arguments to pass to the evaluator.

Source code in evoagentx/optimizers/mipro_optimizer.py
def optimize(self, dataset: Benchmark, metric_name: Optional[str] = None, **kwargs):

    """
    Optimize the program using the Mipro algorithm. 

    Args:
        dataset (Benchmark): a Benchmark object that contains the training and validation data. 
        metric_name (Optional[str]): the name of the metric to use for optimization. Only used when `self.evaluator` is not provided. 
            In this case, the evaluator will be constructed using the `evaluate` method (return a dictionary of scores) in the benchmark, 
            and the metric specified by `metric_name` will be used for optimization. If not provided, the average of all scores returned by the evaluator will be used. 
            If `self.evaluator` is provided, this argument will be ignored. 
        **kwargs: additional keyword arguments to pass to the evaluator. 
    """

    zeroshot_opt = (self.max_bootstrapped_demos == 0) and (self.max_labeled_demos == 0)
    student = self.model
    num_trials = self.max_steps
    minibatch = self.minibatch
    self.metric_name = metric_name

    # If auto is None, and num_trials is not provided (but num_candidates is), raise an error that suggests a good num_trials value
    if self.auto is None and (self.num_candidates is not None and num_trials is None):
        raise ValueError(f"If auto is None, max_steps must also be provided. Given num_candidates={self.num_candidates}, we'd recommend setting max_steps to ~{self._set_num_trials_from_num_candidates(self.model, zeroshot_opt, self.num_candidates)}.")

    # If auto is None, and num_candidates or num_trials is None, raise an error
    if self.auto is None and (self.num_candidates is None or num_trials is None):
        raise ValueError("If auto is None, num_candidates must also be provided.")

    # If auto is provided, and either num_candidates or num_trials is not None, raise an error
    if self.auto is not None and (self.num_candidates is not None or num_trials is not None):
        raise ValueError("If auto is not None, num_candidates and max_steps cannot be set, since they would be overrided by the auto settings. Please either set auto to None, or do not specify num_candidates and max_steps.")

    # Set random seeds
    seed = self.seed
    self._set_random_seeds(seed)

    # Set training & validation sets
    trainset, valset = self._set_and_validate_datasets(dataset=dataset)

    # Set hyperparameters based on run mode (if set)
    num_trials, valset, minibatch = self._set_hyperparams_from_run_mode(
        student, num_trials, minibatch, zeroshot_opt, valset
    )

    if self.auto: 
        self._print_auto_run_settings(num_trials, minibatch, valset)

    if minibatch and self.minibatch_size > len(valset):
        raise ValueError(f"Minibatch size cannot exceed the size of the valset. Valset size: {len(valset)}.")

    # # Estimate LM calls and get user confirmation
    if self.requires_permission_to_run:
        if not self._get_user_confirmation(
            student,
            num_trials,
            minibatch,
            self.minibatch_size,
            self.minibatch_full_eval_steps,
            valset,
            self.program_aware_proposer,
        ):
            logger.info("Compilation aborted by the user.")
            return student  # Return the original student program

    program = student.deepcopy()

    # check the evaluator (If None, will construct a default evaluator using the `evaluate` method in the benchmark) and wrap it with runtime checks
    evaluator = self._validate_evaluator(evaluator=self.evaluator, benchmark=dataset, metric_name=metric_name)
    self.metric = evaluator.metric

    # Step 1: Bootstrap few-shot examples 
    demo_candidates = self._bootstrap_fewshot_examples(program, trainset, seed, teacher=None)

    # Step 2: Propose instruction candidates 
    with suppress_cost_logging():
        instruction_candidates = self._propose_instructions(
            program,
            trainset,
            demo_candidates,
            self.view_data_batch_size,
            self.program_aware_proposer,
            self.data_aware_proposer,
            self.tip_aware_proposer,
            self.fewshot_aware_proposer,
        )

    # Step 3: Find optimal prompt parameters 
    with suppress_cost_logging():
        best_program = self._optimize_prompt_parameters(
            program,
            instruction_candidates,
            demo_candidates,
            evaluator,
            valset,
            num_trials,
            minibatch,
            self.minibatch_size,
            self.minibatch_full_eval_steps,
            seed,
        )

    if self.save_path:
        os.makedirs(self.save_path, exist_ok=True)
        self.best_program_path = os.path.join(self.save_path, "best_program.json")
        best_program.save(self.best_program_path)

    # reset the self.model. After optimization, the model will be reset to the original state.
    # This is necessary to avoid the model being modified by the optimization process. 
    # Use self.restore_best_program() to restore the best program. 
    self.model.reset()
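
A usage sketch; the metric name is a placeholder and only matters when no custom evaluator was supplied. The restore call uses the method name mentioned in the source comment above.

# Run MIPRO optimization against a benchmark; if save_path is set, the best
# program is written to save_path/best_program.json.
optimizer.optimize(dataset=benchmark, metric_name="solve_rate")

# self.model is reset after optimization; restore the best program explicitly.
optimizer.restore_best_program()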

WorkFlowMiproOptimizer

WorkFlowMiproOptimizer(graph: WorkFlowGraph, evaluator: Evaluator, optimizer_llm: Optional[BaseLLM] = None, **kwargs)

Bases: MiproOptimizer

MiproOptimizer tailored for workflow graphs.

Parameters:

    graph (WorkFlowGraph, required): The workflow graph to optimize.
    evaluator (Evaluator, required): The evaluator to use for the optimization.
    optimizer_llm (BaseLLM, default None): The LLM to use for the optimization. If None, the LLM in the evaluator is used.
    **kwargs: Additional keyword arguments passed to MiproOptimizer. Available options:
        metric_threshold (Optional[float]): Threshold for the metric score. If provided, only examples with scores above this threshold will be used as demonstrations.
        max_bootstrapped_demos (int): Maximum number of bootstrapped demonstrations to use. Defaults to 4.
        max_labeled_demos (int): Maximum number of labeled demonstrations to use. Defaults to 4.
        auto (Optional[Literal["light", "medium", "heavy"]]): Automatic configuration mode. If set, overrides num_candidates and max_steps. "light": n=6, val_size=100; "medium": n=12, val_size=300; "heavy": n=18, val_size=1000. Defaults to "medium".
        max_steps (int): Maximum number of optimization steps. Required if auto is None.
        num_candidates (Optional[int]): Number of candidates to generate for each optimization step. Required if auto is None.
        num_threads (Optional[int]): Number of threads to use for parallel evaluation. If None, a single thread is used.
        max_errors (int): Maximum number of errors allowed during evaluation before stopping. Defaults to 10.
        seed (int): Random seed for reproducibility. Defaults to 9.
        init_temperature (float): Initial temperature for instruction generation. Defaults to 0.5.
        track_stats (bool): Whether to track optimization statistics. Defaults to True.
        save_path (Optional[str]): Path to save optimization results. If None, results will not be saved.
        minibatch (bool): Whether to use minibatch evaluation during optimization. Defaults to True.
        minibatch_size (int): Size of the minibatch for evaluation. Defaults to 35.
        minibatch_full_eval_steps (int): Number of minibatch steps between full evaluations. Defaults to 5.
        program_aware_proposer (bool): Whether to use the program-aware instruction proposer. Defaults to True.
        data_aware_proposer (bool): Whether to use the data-aware instruction proposer. Defaults to True.
        view_data_batch_size (int): Batch size for viewing data during instruction proposal. Defaults to 10.
        tip_aware_proposer (bool): Whether to use the tip-aware instruction proposer. Defaults to True.
        fewshot_aware_proposer (bool): Whether to use the fewshot-aware instruction proposer. Defaults to True.
        requires_permission_to_run (bool): Whether to require user permission before running optimization. Defaults to False.
        provide_traceback (Optional[bool]): Whether to provide a traceback for evaluation errors. If None, the default setting is used.

Source code in evoagentx/optimizers/mipro_optimizer.py
def __init__(
    self, 
    graph: WorkFlowGraph,
    evaluator: Evaluator, 
    optimizer_llm: Optional[BaseLLM] = None, 
    **kwargs, 
):
    """
    MiproOptimizer tailored for workflow graphs. 

    Args:
        graph (WorkFlowGraph): the workflow graph to optimize.
        evaluator (Evaluator): the evaluator to use for the optimization.
        optimizer_llm (BaseLLM): the LLM to use for the optimization. If None, will use the LLM model in the evaluator.
        **kwargs: additional keyword arguments to pass to the MiproOptimizer. Available options:
            - metric_threshold (Optional[int]): threshold for the metric score. If provided, only examples with scores above this threshold will be used as demonstrations.
            - max_bootstrapped_demos (int): maximum number of bootstrapped demonstrations to use. Defaults to 4.
            - max_labeled_demos (int): maximum number of labeled demonstrations to use. Defaults to 4.
            - auto (Optional[Literal["light", "medium", "heavy"]]): automatic configuration mode. If set, will override num_candidates and max_steps. 
                "light": n=6, val_size=100; "medium": n=12, val_size=300; "heavy": n=18, val_size=1000. Defaults to "medium".
            - max_steps (int): maximum number of optimization steps. Required if auto is None.
            - num_candidates (Optional[int]): number of candidates to generate for each optimization step. Required if auto is None.
            - num_threads (Optional[int]): number of threads to use for parallel evaluation. If None, will use single thread.
            - max_errors (int): maximum number of errors allowed during evaluation before stopping. Defaults to 10.
            - seed (int): random seed for reproducibility. Defaults to 9.
            - init_temperature (float): initial temperature for instruction generation. Defaults to 0.5.
            - track_stats (bool): whether to track optimization statistics. Defaults to True.
            - save_path (Optional[str]): path to save optimization results. If None, results will not be saved.
            - minibatch (bool): whether to use minibatch evaluation during optimization. Defaults to True.
            - minibatch_size (int): size of minibatch for evaluation. Defaults to 35.
            - minibatch_full_eval_steps (int): number of minibatch steps between full evaluations. Defaults to 5.
            - program_aware_proposer (bool): whether to use program-aware instruction proposer. Defaults to True.
            - data_aware_proposer (bool): whether to use data-aware instruction proposer. Defaults to True.
            - view_data_batch_size (int): batch size for viewing data during instruction proposal. Defaults to 10.
            - tip_aware_proposer (bool): whether to use tip-aware instruction proposer. Defaults to True.
            - fewshot_aware_proposer (bool): whether to use fewshot-aware instruction proposer. Defaults to True.
            - requires_permission_to_run (bool): whether to require user permission before running optimization. Defaults to False.
            - provide_traceback (Optional[bool]): whether to provide traceback for evaluation errors. If None, will use default setting.
    """

    # check if the graph is compatible with the WorkFlowMipro optimizer.
    graph = self._validate_graph_compatibility(graph=graph)

    # convert the workflow graph to a callable program  
    workflow_graph_program = WorkFlowGraphProgram(
        graph=graph, 
        agent_manager=evaluator.agent_manager, 
        executor_llm=evaluator.llm, 
        collate_func=evaluator.collate_func, 
        output_postprocess_func=evaluator.output_postprocess_func, 
    )

    # register optimizable parameters 
    registry = self._register_optimizable_parameters(program=workflow_graph_program)

    super().__init__(
        registry=registry, 
        program=workflow_graph_program, 
        optimizer_llm=optimizer_llm or evaluator.llm, 
        evaluator=evaluator,
        **kwargs
    )
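
An end-to-end sketch, assuming a WorkFlowGraph, an Evaluator, and a Benchmark have been built elsewhere; the keyword options are forwarded to MiproOptimizer as documented above.

# Wrap an existing workflow graph and evaluator; the evaluator's LLM is
# reused for optimization when optimizer_llm is omitted.
optimizer = WorkFlowMiproOptimizer(
    graph=workflow_graph,
    evaluator=evaluator,
    auto="light",
    save_path="output/mipro_workflow",
)
optimizer.optimize(dataset=benchmark, metric_name="f1")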