🧮 Optimizers

evoagentx.optimizers

SEWOptimizer

SEWOptimizer(**kwargs)

Bases: Optimizer

Source code in evoagentx/core/module.py
def __init__(self, **kwargs):
    """
    Initializes a BaseModule instance.

    Args:
        **kwargs (Any): Keyword arguments used to initialize the instance

    Raises:
        ValidationError: When parameter validation fails
        Exception: When other errors occur during initialization
    """

    try:
        for field_name, _ in type(self).model_fields.items():
            field_value = kwargs.get(field_name, None)
            if field_value:
                kwargs[field_name] = self._process_data(field_value)
            # if field_value and isinstance(field_value, dict) and "class_name" in field_value:
            #     class_name = field_value.get("class_name")
            #     sub_cls = MODULE_REGISTRY.get_module(cls_name=class_name)
            #     kwargs[field_name] = sub_cls._create_instance(field_value)
        super().__init__(**kwargs) 
        self.init_module()
    except (ValidationError, Exception) as e:
        exception_handler = callback_manager.get_callback("exception_buffer")
        if exception_handler is None:
            error_message = get_base_module_init_error_message(
                cls=self.__class__, 
                data=kwargs, 
                errors=e
            )
            logger.error(error_message)
            raise
        else:
            exception_handler.add(e)

step

step(**kwargs) -> Union[SequentialWorkFlowGraph, ActionGraph]

Take a step of optimization and return the optimized graph.

Source code in evoagentx/optimizers/sew_optimizer.py
def step(self, **kwargs) -> Union[SequentialWorkFlowGraph, ActionGraph]:
    """
    Take a step of optimization and return the optimized graph.
    """
    graph = self._select_graph_with_highest_score(return_metrics=False)
    if isinstance(graph, SequentialWorkFlowGraph):
        new_graph = self._workflow_graph_step(graph)
    elif isinstance(graph, ActionGraph):
        new_graph = self._action_graph_step(graph)
    else:
        raise ValueError(f"Invalid graph type: {type(graph)}. The graph should be an instance of `WorkFlowGraph` or `ActionGraph`.")
    return new_graph
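
The loop below is a minimal sketch of driving the optimizer by hand with step(); it assumes an already constructed SEWOptimizer named optimizer and a Benchmark named benchmark (the library's own optimize routine may manage this loop and snapshot logging internally).

# Hypothetical manual optimization loop (sketch).
for i in range(5):
    # step() picks the best graph seen so far and returns an optimized variant.
    new_graph = optimizer.step()
    # Score the candidate on the dev split using the documented evaluate() signature.
    metrics = optimizer.evaluate(dataset=benchmark, eval_mode="dev", graph=new_graph)
    print(f"round {i + 1}: {metrics}")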

evaluate

evaluate(dataset: Benchmark, eval_mode: str = 'test', graph: Optional[Union[SequentialWorkFlowGraph, ActionGraph]] = None, indices: Optional[List[int]] = None, sample_k: Optional[int] = None, **kwargs) -> dict

Evaluate the workflow. If graph is provided, use the provided graph for evaluation. Otherwise, use the graph in the optimizer.

Parameters:

    dataset (Benchmark, required): The dataset to evaluate the workflow on.
    eval_mode (str, default "test"): The evaluation mode. Choices: ["test", "dev", "train"].
    graph (Union[SequentialWorkFlowGraph, ActionGraph], optional): The graph to evaluate. If not provided, use the graph in the optimizer. Defaults to None.
    indices (List[int], optional): The indices of the data to evaluate the workflow on. Defaults to None.
    sample_k (int, optional): The number of examples to evaluate the workflow on. If provided, a random sample of size sample_k will be used. Defaults to None.

Returns:

    dict: The metrics of the workflow evaluation.

Source code in evoagentx/optimizers/sew_optimizer.py
def evaluate(
    self, 
    dataset: Benchmark, 
    eval_mode: str = "test", 
    graph: Optional[Union[SequentialWorkFlowGraph, ActionGraph]] = None,
    indices: Optional[List[int]] = None,
    sample_k: Optional[int] = None,
    **kwargs
) -> dict:
    """
    Evaluate the workflow. If `graph` is provided, use the provided graph for evaluation. Otherwise, use the graph in the optimizer. 

    Args:
        dataset (Benchmark): The dataset to evaluate the workflow on.
        eval_mode (str): The evaluation mode. Choices: ["test", "dev", "train"].
        graph (Union[WorkFlowGraph, ActionGraph], optional): The graph to evaluate. If not provided, use the graph in the optimizer.
        indices (List[int], optional): The indices of the data to evaluate the workflow on.
        sample_k (int, optional): The number of data to evaluate the workflow on. If provided, a random sample of size `sample_k` will be used.

    Returns:
        dict: The metrics of the workflow evaluation.
    """
    graph = graph if graph is not None else self.graph
    metrics_list = []
    for i in range(self.eval_rounds):
        eval_info = [
            f"[{type(graph).__name__}]", 
            f"Evaluation round {i+1}/{self.eval_rounds}", 
            f"Mode: {eval_mode}"
        ]
        if indices is not None:
            eval_info.append(f"Indices: {len(indices)} samples")
        if sample_k is not None:
            eval_info.append(f"Sample size: {sample_k}")
        logger.info(" | ".join(eval_info))
        metrics = self.evaluator.evaluate(
            graph=graph, 
            benchmark=dataset, 
            eval_mode=eval_mode, 
            indices=indices, 
            sample_k=sample_k,
            **kwargs
        )
        metrics_list.append(metrics)
    avg_metrics = self.evaluator._calculate_average_score(metrics_list)

    return avg_metrics
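
For example, the calls below (a sketch; optimizer, benchmark, and candidate_graph are assumed to exist) evaluate the current graph on 50 randomly sampled dev examples and a specific candidate graph on fixed test indices; with eval_rounds > 1 the returned metrics are averaged over the rounds.

# Evaluate the optimizer's current graph on the dev split with 50 sampled examples.
dev_metrics = optimizer.evaluate(dataset=benchmark, eval_mode="dev", sample_k=50)

# Evaluate a specific candidate graph on fixed test indices.
test_metrics = optimizer.evaluate(
    dataset=benchmark,
    eval_mode="test",
    graph=candidate_graph,  # a SequentialWorkFlowGraph or ActionGraph
    indices=[0, 1, 2, 3],
)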

save

save(path: str, ignore: List[str] = [])

Save the (optimized) workflow graph to a file.

Parameters:

    path (str, required): The path to save the workflow graph.
    ignore (List[str], default []): The keys to ignore when saving the workflow graph.

Source code in evoagentx/optimizers/sew_optimizer.py
def save(self, path: str, ignore: List[str] = []):
    """
    Save the (optimized) workflow graph to a file. 

    Args:
        path (str): The path to save the workflow graph.
        ignore (List[str]): The keys to ignore when saving the workflow graph.
    """
    self.graph.save_module(path, ignore=ignore)
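
A short usage sketch; the key listed in ignore is illustrative, not a documented field name.

# Persist the optimized graph; keys in `ignore` are dropped from the saved module.
optimizer.save("output/sew_optimized_graph.json", ignore=["llm_config"])  # "llm_config" is a hypothetical key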

AFlowOptimizer

AFlowOptimizer(**kwargs)

Bases: BaseModule

AFlow Optimizer for workflow optimization.

This optimizer iteratively improves workflows through multiple rounds of optimization using large language models. It evaluates workflow performance, identifies improvement opportunities, and applies optimizations based on experience and convergence metrics.

Attributes:

    question_type (str): Type of task to optimize for (e.g., qa, match, code).
    graph_path (str): Path to the workflow graph directory (must contain graph.py and prompt.py).
    optimized_path (str): Path to save optimized workflows (defaults to graph_path).
    initial_round (int): Starting round number for optimization.
    optimizer_llm (BaseLLM): LLM used for generating optimizations.
    executor_llm (BaseLLM): LLM used for executing the workflow.
    operators (List[str]): List of operators available for optimization.
    sample (int): Number of rounds to sample from for optimization.
    max_rounds (int): Maximum number of optimization rounds to perform.
    validation_rounds (int): Number of validation runs per optimization round.
    eval_rounds (int): Number of evaluation runs for test mode.
    check_convergence (bool): Whether to check for optimization convergence.
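
A construction sketch based on the attributes above; it assumes they can be passed as keyword arguments through BaseModule's generic **kwargs constructor, and all concrete values are placeholders.

# Hypothetical construction; attribute names follow the list above, values are placeholders.
optimizer = AFlowOptimizer(
    question_type="qa",
    graph_path="examples/aflow/hotpotqa",             # must contain graph.py and prompt.py
    optimized_path="examples/aflow/hotpotqa_optimized",
    optimizer_llm=optimizer_llm,                      # a BaseLLM used to propose optimizations
    executor_llm=executor_llm,                        # a BaseLLM used to run the workflow
    operators=["Custom", "AnswerGenerate"],           # operator names are illustrative
    max_rounds=10,
    validation_rounds=3,
)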

Source code in evoagentx/core/module.py
def __init__(self, **kwargs):
    """
    Initializes a BaseModule instance.

    Args:
        **kwargs (Any): Keyword arguments used to initialize the instance

    Raises:
        ValidationError: When parameter validation fails
        Exception: When other errors occur during initialization
    """

    try:
        for field_name, _ in type(self).model_fields.items():
            field_value = kwargs.get(field_name, None)
            if field_value:
                kwargs[field_name] = self._process_data(field_value)
            # if field_value and isinstance(field_value, dict) and "class_name" in field_value:
            #     class_name = field_value.get("class_name")
            #     sub_cls = MODULE_REGISTRY.get_module(cls_name=class_name)
            #     kwargs[field_name] = sub_cls._create_instance(field_value)
        super().__init__(**kwargs) 
        self.init_module()
    except (ValidationError, Exception) as e:
        exception_handler = callback_manager.get_callback("exception_buffer")
        if exception_handler is None:
            error_message = get_base_module_init_error_message(
                cls=self.__class__, 
                data=kwargs, 
                errors=e
            )
            logger.error(error_message)
            raise
        else:
            exception_handler.add(e)

optimize

optimize(benchmark: Benchmark)

Run the optimization process on the workflow.

Performs multiple rounds of optimization, evaluating each round against the benchmark and checking for convergence. Continues until convergence is detected or the maximum number of rounds is reached.

Parameters:

    benchmark (Benchmark, required): The benchmark to evaluate the workflow against.

Source code in evoagentx/optimizers/aflow_optimizer.py
def optimize(self, benchmark: Benchmark):
    """Run the optimization process on the workflow.

    Performs multiple rounds of optimization, evaluating each round against
    the benchmark and checking for convergence. Continues until convergence
    is detected or the maximum number of rounds is reached.

    Args:
        benchmark: The benchmark to evaluate the workflow against
    """
    self.benchmark = benchmark
    for _ in range(self.max_rounds):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        score = loop.run_until_complete(self._execute_with_retry(self._optimize_graph))
        self.round += 1
        logger.info(f"Score for round {self.round}: {score}")
        if self._check_convergence():
            break
        if self.round >= self.max_rounds:
            logger.info(f"Max rounds reached: {self.max_rounds}, stopping optimization.")
            break
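
A usage sketch, assuming the optimizer above and a Benchmark named benchmark:

# Run up to max_rounds optimization rounds against the benchmark,
# stopping early if convergence is detected.
optimizer.optimize(benchmark)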

test

test(benchmark: Benchmark, test_rounds: List[int] = None)

Run the test evaluation on optimized workflows.

Evaluates specified rounds (or the best round if none specified) against the benchmark multiple times and logs the results.

Parameters:

    benchmark (Benchmark, required): The benchmark to evaluate against.
    test_rounds (List[int], default None): Specific round numbers to test, or None to use the best round.

Source code in evoagentx/optimizers/aflow_optimizer.py
def test(self, benchmark: Benchmark, test_rounds: List[int] = None):
    """Run the test evaluation on optimized workflows.

    Evaluates specified rounds (or the best round if none specified) against
    the benchmark multiple times and logs the results.

    Args:
        benchmark: The benchmark to evaluate against
        test_rounds: Specific round numbers to test, or None to use the best round
    """
    self.benchmark = benchmark
    if test_rounds is None:
        best_round = self._load_best_round()
        logger.info(f"No test rounds provided, using best round: {best_round}")
        test_rounds = [best_round]
    for _ in tqdm(range(self.eval_rounds)):
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
        loop.run_until_complete(self._run_test(test_rounds))
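
For example (a sketch; the round numbers are placeholders):

# Test the best round found during optimization ...
optimizer.test(benchmark)

# ... or test specific optimization rounds.
optimizer.test(benchmark, test_rounds=[3, 5])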

TextGradOptimizer

TextGradOptimizer(**kwargs)

Bases: BaseModule

Uses TextGrad to optimize agents' system prompts and instructions in a multi-agent workflow. For more information on TextGrad, see https://github.com/zou-group/textgrad.

Source code in evoagentx/core/module.py
def __init__(self, **kwargs):
    """
    Initializes a BaseModule instance.

    Args:
        **kwargs (Any): Keyword arguments used to initialize the instance

    Raises:
        ValidationError: When parameter validation fails
        Exception: When other errors occur during initialization
    """

    try:
        for field_name, _ in type(self).model_fields.items():
            field_value = kwargs.get(field_name, None)
            if field_value:
                kwargs[field_name] = self._process_data(field_value)
            # if field_value and isinstance(field_value, dict) and "class_name" in field_value:
            #     class_name = field_value.get("class_name")
            #     sub_cls = MODULE_REGISTRY.get_module(cls_name=class_name)
            #     kwargs[field_name] = sub_cls._create_instance(field_value)
        super().__init__(**kwargs) 
        self.init_module()
    except (ValidationError, Exception) as e:
        exception_handler = callback_manager.get_callback("exception_buffer")
        if exception_handler is None:
            error_message = get_base_module_init_error_message(
                cls=self.__class__, 
                data=kwargs, 
                errors=e
            )
            logger.error(error_message)
            raise
        else:
            exception_handler.add(e)

optimize

optimize(dataset: Benchmark, use_answers: bool = True, seed: Optional[int] = None) -> None

Optimizes self.graph using dataset.

Parameters:

    dataset (Benchmark, required): The dataset to use for optimization.
    use_answers (bool, default True): Whether to use the answers (labels) in the training set for optimization. If False, the dataset's training set does not need to have answers. If eval_every_n_steps is set to None, the workflow can be optimized without any labeled data.
    seed (Optional[int], default None): The random seed to use for shuffling the data.

Source code in evoagentx/optimizers/textgrad_optimizer.py
def optimize(self, dataset: Benchmark, use_answers: bool = True, seed: Optional[int] = None) -> None:
    """Optimizes self.graph using `dataset`.

    Args:
        dataset (Benchmark): The dataset to use for optimization.
        use_answers (bool): Whether to use the answers (labels) in the training set for optimization.
            If False, `dataset`'s training set does not need to have answers.
            If `eval_every_n_steps` is set to None, we can optimize the workflow without any labeled data.
        seed (Optional[int]): The random seed to use for shuffling the data.
    """
    self._init_textgrad(dataset, use_answers)

    def iterator() -> Iterator[Tuple[List[dict[str, str]],  Optional[List[Union[str, dict[str, str]]]]]]:
        epoch = 0
        while True:
            # Shuffle train data every epoch
            effective_seed = seed + epoch if seed is not None else None
            train_data = dataset.get_train_data(sample_k=len(dataset._train_data), seed=effective_seed)
            for i in range(0, len(train_data), self.batch_size):
                batch = train_data[i:i + self.batch_size]
                inputs = [self.evaluator.collate_func(x) for x in batch]
                if use_answers:
                    labels = dataset.get_labels(batch)
                else:
                    labels = None
                yield inputs, labels
            epoch += 1

    data_iterator = iterator()

    for step in tqdm(range(self.max_steps)):
        inputs, labels = next(data_iterator)
        self.step(inputs, labels, dataset, use_answers)

        if self.eval_every_n_steps is not None and (step + 1) % self.eval_every_n_steps == 0:
            logger.info(f"Evaluating the workflow at step {step+1} ...")
            with suppress_logger_info():
                metrics = self.evaluate(dataset, **self.eval_config)
            self.log_snapshot(self.graph, metrics)
            logger.info(f"Step {step+1} metrics: {metrics}")

            # If rollback is enabled, keep track of the best snapshot
            if self.rollback:
                if len(self._snapshot) == 1:
                    best_snapshot = self._snapshot[-1]
                    best_average_score = np.mean(list(metrics.values()))
                else:
                    current_average_score = np.mean(list(metrics.values()))

                    if current_average_score >= best_average_score:
                        # If the current average score is better than the best average score, update the best snapshot
                        best_snapshot = self._snapshot[-1]
                        best_average_score = current_average_score
                    else:
                        # If the current average score is worse than the best average score, roll back to the best snapshot
                        logger.info(f"Metrics are worse than the best snapshot which has {best_snapshot['metrics']}. Rolling back to the best snapshot.")
                        best_graph = WorkFlowGraph.from_dict(best_snapshot["graph"])
                        self.graph = best_graph
                        self._create_textgrad_agents()

        if self.save_interval is not None and (step + 1) % self.save_interval == 0:
            logger.info(f"Saving the workflow at step {step+1} ...")
            self.save(os.path.join(self.save_path, f"{dataset.name}_textgrad_step_{step+1}.json"))

    logger.info(f"Reached the maximum number of steps {self.max_steps}. Optimization has finished.")
    self.save(os.path.join(self.save_path, f"{dataset.name}_textgrad_final.json"))

    # Saves the best graph
    if len(self._snapshot) > 0:
        best_graph = self._select_graph_with_highest_score()
        self.save(os.path.join(self.save_path, f"{dataset.name}_textgrad_best.json"), graph=best_graph)

step

step(inputs: list[dict[str, str]], labels: Optional[list[Union[str, dict[str, str]]]], dataset: Benchmark, use_answers: bool = True) -> None

Performs one optimization step using a batch of data.

Source code in evoagentx/optimizers/textgrad_optimizer.py
def step(
    self, 
    inputs: list[dict[str, str]], 
    labels: Optional[list[Union[str, dict[str, str]]]], 
    dataset: Benchmark, 
    use_answers: bool = True
) -> None:
    """Performs one optimization step using a batch of data."""

    losses = []
    logger.info("Executing workflow...")

    if use_answers:
        if labels is None:
            raise ValueError("Labels must be provided if `use_answers` is True.")

        for input, label in zip(inputs, labels, strict=True):
            output = self.forward(input)
            if isinstance(label, str):
                label = Variable(label, requires_grad=False, role_description="correct answer for the query")
            elif isinstance(label, dict):
                if not isinstance(dataset, CodingBenchmark):
                    raise ValueError("Label must be a string for non-coding benchmarks.")
                end_node_name = self.graph.find_end_nodes()[0]
                end_node = self.graph.get_node(end_node_name)
                output_name = end_node.outputs[0].name
                code = output.parsed_outputs[output_name]
                label = self._format_code_label(code, label, dataset)
                label = Variable(label, requires_grad=False, role_description="the task, the test result, and the correct code")
            loss = self.loss_fn([output, label])
            losses.append(loss)
    else:
        for input in inputs:
            output = self.forward(input)
            loss = self.loss_fn(output)
            losses.append(loss)

    total_loss = tg.sum(losses)
    logger.info("Computing gradients...")
    total_loss.backward(self.optimizer_engine)
    logger.info("Updating agents...")
    self.textgrad_optimizer.step()
    self.textgrad_optimizer.zero_grad()
    self._update_workflow_graph()
    logger.info("Agents updated")

forward

forward(inputs: dict[str, str]) -> Variable

Returns the final output from the workflow.

Source code in evoagentx/optimizers/textgrad_optimizer.py
def forward(self, inputs: dict[str, str]) -> Variable:
    """Returns the final output from the workflow."""
    self._visited_nodes = set()
    end_node = self.graph.find_end_nodes()[0]
    input_variables = self._initial_inputs_to_variables(inputs)
    output = self._compute_node(end_node, input_variables)
    return output

evaluate

evaluate(dataset: Benchmark, eval_mode: str = 'dev', graph: Optional[WorkFlowGraph] = None, indices: Optional[List[int]] = None, sample_k: Optional[int] = None, **kwargs) -> dict

Evaluate the workflow. If graph is provided, use the provided graph for evaluation. Otherwise, use the graph in the optimizer.

Parameters:

    dataset (Benchmark, required): The dataset to evaluate the workflow on.
    eval_mode (str, default "dev"): The evaluation mode. Choices: ["test", "dev", "train"].
    graph (WorkFlowGraph, optional): The graph to evaluate. If not provided, use the graph in the optimizer. Defaults to None.
    indices (List[int], optional): The indices of the data to evaluate the workflow on. Defaults to None.
    sample_k (int, optional): The number of examples to evaluate the workflow on. If provided, a random sample of size sample_k will be used. Defaults to None.

Returns:

    dict: The metrics of the workflow evaluation.

Source code in evoagentx/optimizers/textgrad_optimizer.py
def evaluate(
    self, 
    dataset: Benchmark, 
    eval_mode: str = "dev", 
    graph: Optional[WorkFlowGraph] = None,
    indices: Optional[List[int]] = None,
    sample_k: Optional[int] = None,
    **kwargs
) -> dict:
    """Evaluate the workflow. If `graph` is provided, use the provided graph for evaluation. Otherwise, use the graph in the optimizer. 

    Args:
        dataset (Benchmark): The dataset to evaluate the workflow on.
        eval_mode (str): The evaluation mode. Choices: ["test", "dev", "train"].
        graph (WorkFlowGraph, optional): The graph to evaluate. If not provided, use the graph in the optimizer.
        indices (List[int], optional): The indices of the data to evaluate the workflow on.
        sample_k (int, optional): The number of data to evaluate the workflow on. If provided, a random sample of size `sample_k` will be used.

    Returns:
        dict: The metrics of the workflow evaluation.
    """
    if graph is None:
        graph = self.graph

    metrics_list = []
    for i in range(self.eval_rounds):
        eval_info = [
            f"[{type(graph).__name__}]", 
            f"Evaluation round {i+1}/{self.eval_rounds}", 
            f"Mode: {eval_mode}"
        ]
        if indices is not None:
            eval_info.append(f"Indices: {len(indices)} samples")
        if sample_k is not None:
            eval_info.append(f"Sample size: {sample_k}")
        logger.info(" | ".join(eval_info))
        metrics = self.evaluator.evaluate(
            graph=graph, 
            benchmark=dataset, 
            eval_mode=eval_mode, 
            indices=indices, 
            sample_k=sample_k,
            update_agents=True, 
            **kwargs
        )
        metrics_list.append(metrics)
    avg_metrics = self.evaluator._calculate_average_score(metrics_list)

    return avg_metrics

save

save(path: str, graph: Optional[WorkFlowGraph] = None, ignore: List[str] = []) -> None

Save the workflow graph containing the optimized prompts to a file.

Parameters:

    path (str, required): The path to save the workflow graph.
    graph (WorkFlowGraph, optional): The graph to save. If not provided, use the graph in the optimizer. Defaults to None.
    ignore (List[str], default []): The keys to ignore when saving the workflow graph.

Source code in evoagentx/optimizers/textgrad_optimizer.py
def save(self, path: str, graph: Optional[WorkFlowGraph] = None, ignore: List[str] = []) -> None:
    """Save the workflow graph containing the optimized prompts to a file. 

    Args:
        path (str): The path to save the workflow graph.
        graph (WorkFlowGraph, optional): The graph to save. If not provided, use the graph in the optimizer.
        ignore (List[str]): The keys to ignore when saving the workflow graph.
    """
    if graph is None:
        graph = self.graph
    graph.save_module(path, ignore=ignore)

log_snapshot

log_snapshot(graph: WorkFlowGraph, metrics: dict) -> None

Log the snapshot of the workflow.

Source code in evoagentx/optimizers/textgrad_optimizer.py
def log_snapshot(self, graph: WorkFlowGraph, metrics: dict) -> None:
    """Log the snapshot of the workflow."""
    self._snapshot.append(
        {
            "index": len(self._snapshot),
            "graph": deepcopy(graph.get_config()),
            "metrics": metrics,
        }
    )

restore_best_graph

restore_best_graph() -> None

Restore the best graph from the snapshot and set it to self.graph.

Source code in evoagentx/optimizers/textgrad_optimizer.py
def restore_best_graph(self) -> None:
    """Restore the best graph from the snapshot and set it to `self.graph`."""
    if len(self._snapshot) == 0:
        logger.info("No snapshot found. No graph to restore.")
        return

    best_graph, best_metrics = self._select_graph_with_highest_score(return_metrics=True)
    self.graph = best_graph
    logger.info(f"Restored the best graph from snapshot with metrics {best_metrics}")

MiproOptimizer

MiproOptimizer(registry: ParamRegistry, program: Callable, optimizer_llm: BaseLLM, evaluator: Optional[Callable] = None, eval_rounds: Optional[int] = 1, metric_threshold: Optional[float] = None, max_bootstrapped_demos: int = 4, max_labeled_demos: int = 4, auto: Optional[Literal['light', 'medium', 'heavy']] = 'medium', max_steps: int = None, num_candidates: Optional[int] = None, num_threads: Optional[int] = None, max_errors: int = 10, seed: int = 9, init_temperature: float = 0.5, track_stats: bool = True, save_path: Optional[str] = None, minibatch: bool = True, minibatch_size: int = 35, minibatch_full_eval_steps: int = 5, program_aware_proposer: bool = True, data_aware_proposer: bool = True, view_data_batch_size: int = 10, tip_aware_proposer: bool = True, fewshot_aware_proposer: bool = True, requires_permission_to_run: bool = False, provide_traceback: Optional[bool] = None, verbose: bool = False, **kwargs)

Bases: BaseOptimizer, MIPROv2

Base MiproOptimizer class that supports plug-and-play usage.

Parameters:

    registry (ParamRegistry, required): A ParamRegistry object that contains the parameters to optimize.
    program (Callable, required): The program to optimize. Must be a callable object with save(path) and load(path) methods.
    optimizer_llm (BaseLLM, required): The language model to use for optimization.
    evaluator (Optional[Callable], default None): A function that evaluates the performance of the program. Required to have a __call__(program, evalset, **kwargs) -> float method that receives a program and a list of examples from a benchmark's train/dev/test set and returns a float score. Must also have a metric(example, prediction) -> float method that evaluates a single example. If not provided, a default evaluator is constructed from the benchmark's evaluate method.
    eval_rounds (Optional[int], default 1): Number of rounds to evaluate the program.
    metric_threshold (Optional[float], default None): Threshold for the metric score. If provided, only examples with scores above this threshold will be used as demonstrations; otherwise, examples with scores above 0 will be used.
    max_bootstrapped_demos (int, default 4): Maximum number of bootstrapped demonstrations to use.
    max_labeled_demos (int, default 4): Maximum number of labeled demonstrations to use.
    auto (Optional[Literal["light", "medium", "heavy"]], default "medium"): Automatic configuration mode. If set, overrides num_candidates and max_steps. "light": n=6, val_size=100; "medium": n=12, val_size=300; "heavy": n=18, val_size=1000.
    max_steps (int, default None): Maximum number of optimization steps. Required if auto is None.
    num_candidates (Optional[int], default None): Number of candidates to generate for each optimization step. Required if auto is None.
    num_threads (Optional[int], default None): Number of threads to use for parallel evaluation. If None, a single thread is used. Only used if evaluator is not provided.
    max_errors (int, default 10): Maximum number of errors allowed during evaluation before stopping.
    seed (int, default 9): Random seed for reproducibility.
    init_temperature (float, default 0.5): Initial temperature for instruction generation.
    track_stats (bool, default True): Whether to track optimization statistics.
    save_path (Optional[str], default None): Path to save optimization results. If None, results will not be saved.
    minibatch (bool, default True): Whether to use minibatch evaluation during optimization.
    minibatch_size (int, default 35): Size of the minibatch for evaluation.
    minibatch_full_eval_steps (int, default 5): Number of minibatch steps between full evaluations.
    program_aware_proposer (bool, default True): Whether to use the program-aware instruction proposer.
    data_aware_proposer (bool, default True): Whether to use the data-aware instruction proposer.
    view_data_batch_size (int, default 10): Batch size for viewing data during instruction proposal.
    tip_aware_proposer (bool, default True): Whether to use the tip-aware instruction proposer.
    fewshot_aware_proposer (bool, default True): Whether to use the fewshot-aware instruction proposer.
    requires_permission_to_run (bool, default False): Whether to require user permission before running optimization.
    provide_traceback (Optional[bool], default None): Whether to provide a traceback for evaluation errors. If None, the default setting is used.
    **kwargs: Additional keyword arguments to pass to the evaluator.

Raises:

    TypeError: If program is not callable or the evaluator does not return a float.
    ValueError: If program does not have the required methods (save and load) or if the evaluator does not have the required methods.

Source code in evoagentx/optimizers/mipro_optimizer.py
def __init__(
    self,
    registry: ParamRegistry,
    program: Callable,
    optimizer_llm: BaseLLM,
    evaluator: Optional[Callable] = None,
    eval_rounds: Optional[int] = 1, 
    metric_threshold: Optional[float] = None,
    max_bootstrapped_demos: int = 4, 
    max_labeled_demos: int = 4, 
    auto: Optional[Literal["light", "medium", "heavy"]] = "medium", 
    max_steps: int = None, 
    num_candidates: Optional[int] = None, 
    num_threads: Optional[int] = None, 
    max_errors: int = 10, 
    seed: int = 9, 
    init_temperature: float = 0.5, 
    track_stats: bool = True, 
    save_path: Optional[str] = None,  
    minibatch: bool = True, 
    minibatch_size: int = 35, 
    minibatch_full_eval_steps: int = 5, 
    program_aware_proposer: bool = True,
    data_aware_proposer: bool = True,
    view_data_batch_size: int = 10,
    tip_aware_proposer: bool = True,
    fewshot_aware_proposer: bool = True,
    requires_permission_to_run: bool = False,
    provide_traceback: Optional[bool] = None,
    verbose: bool = False, 
    **kwargs
):
    """
    Base MiproOptimizer class that supports plug-and-play usage. 

    Args: 
        registry (ParamRegistry): a ParamRegistry object that contains the parameters to optimize. 
        program (Callable): a program to optimize. Must be a callable object with save(path) and load(path) methods.
        optimizer_llm (BaseLLM): a language model to use for optimization. 
        evaluator (Optional[Callable]): a function that evaluates the performance of the program. 
            Required to have a `__call__(program, evalset, *kwargs) -> float` method that receives a program and a list of 
            examples from a benchmark's train/dev/test set and return a float score. Must also have a `metric(example, prediction) -> float` 
            method that evaluates a single example. If not provided, will construct a default evaluator using the benchmark's evaluate method.
        eval_rounds (Optional[int]): number of rounds to evaluate the program. Defaults to 1. 
        metric_threshold (Optional[float]): threshold for the metric score. If provided, only examples with scores above this threshold will be used as demonstrations. 
            If not provided, examples with scores above 0 will be used as demonstrations. 
        max_bootstrapped_demos (int): maximum number of bootstrapped demonstrations to use. Defaults to 4.
        max_labeled_demos (int): maximum number of labeled demonstrations to use. Defaults to 4.
        auto (Optional[Literal["light", "medium", "heavy"]]): automatic configuration mode. If set, will override num_candidates and max_steps. 
            "light": n=6, val_size=100; "medium": n=12, val_size=300; "heavy": n=18, val_size=1000. Defaults to "medium".
        max_steps (int): maximum number of optimization steps. Required if auto is None.
        num_candidates (Optional[int]): number of candidates to generate for each optimization step. Required if auto is None.
        num_threads (Optional[int]): number of threads to use for parallel evaluation. If None, will use single thread. Only used if evaluator is not provided. 
        max_errors (int): maximum number of errors allowed during evaluation before stopping. Defaults to 10.
        seed (int): random seed for reproducibility. Defaults to 9.
        init_temperature (float): initial temperature for instruction generation. Defaults to 0.5.
        track_stats (bool): whether to track optimization statistics. Defaults to True.
        save_path (Optional[str]): path to save optimization results. If None, results will not be saved.
        minibatch (bool): whether to use minibatch evaluation during optimization. Defaults to True.
        minibatch_size (int): size of minibatch for evaluation. Defaults to 35.
        minibatch_full_eval_steps (int): number of minibatch steps between full evaluations. Defaults to 5.
        program_aware_proposer (bool): whether to use program-aware instruction proposer. Defaults to True.
        data_aware_proposer (bool): whether to use data-aware instruction proposer. Defaults to True.
        view_data_batch_size (int): batch size for viewing data during instruction proposal. Defaults to 10.
        tip_aware_proposer (bool): whether to use tip-aware instruction proposer. Defaults to True.
        fewshot_aware_proposer (bool): whether to use fewshot-aware instruction proposer. Defaults to True.
        requires_permission_to_run (bool): whether to require user permission before running optimization. Defaults to False.
        provide_traceback (Optional[bool]): whether to provide traceback for evaluation errors. If None, will use default setting.
        **kwargs: additional keyword arguments to pass to the evaluator.

    Raises:
        TypeError: If program is not callable or evaluator doesn't return float
        ValueError: If program doesn't have required methods (save and load) or if evaluator doesn't have required methods
    """

    # initialize base optimizer
    BaseOptimizer.__init__(self, registry=registry, program=program, evaluator=evaluator)

    # convert the registry and program to dspy-compatible module
    self._validate_program(program=program)
    self.model = self._convert_to_dspy_module(registry, program)
    self.optimizer_llm = MiproLMWrapper(optimizer_llm)
    dspy.configure(lm=self.optimizer_llm)
    self.task_model = dspy.settings.lm 
    self.prompt_model = dspy.settings.lm 
    self.metric_threshold = metric_threshold
    self.metric_name = None 
    self.teacher_settings = {"use_teacher": True} 

    # Validate 'auto' parameter
    allowed_modes = {None, "light", "medium", "heavy"}
    if auto not in allowed_modes:
        raise ValueError(f"Invalid value for auto: {auto}. Must be one of {allowed_modes}.")
    self.auto = auto
    self.num_fewshot_candidates = num_candidates
    self.num_instruct_candidates = num_candidates
    self.num_candidates = num_candidates
    self.init_temperature = init_temperature
    self.max_bootstrapped_demos = max_bootstrapped_demos
    self.max_labeled_demos = max_labeled_demos
    self.max_steps = max_steps
    self.num_threads = num_threads
    self.max_errors = max_errors

    self.track_stats = track_stats
    self.eval_rounds = eval_rounds 
    self.save_path = save_path
    self.prompt_model_total_calls = 0
    self.total_calls = 0
    self.seed = seed
    self.rng = None

    self.minibatch = minibatch 
    self.minibatch_size = minibatch_size 
    self.minibatch_full_eval_steps = minibatch_full_eval_steps 
    self.program_aware_proposer = program_aware_proposer 
    self.data_aware_proposer = data_aware_proposer 
    self.view_data_batch_size = view_data_batch_size 
    self.tip_aware_proposer = tip_aware_proposer 
    self.fewshot_aware_proposer = fewshot_aware_proposer 
    self.requires_permission_to_run = requires_permission_to_run 
    self.provide_traceback = provide_traceback 
    self.verbose = verbose
    self.kwargs = kwargs 
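
A construction sketch using the documented signature; registry, program, and llm are assumed to be prepared elsewhere, and program must expose save(path) and load(path) methods.

# Hypothetical setup: `registry` is a ParamRegistry holding the parameters to
# tune, `program` a callable with save/load methods, and `llm` a BaseLLM.
optimizer = MiproOptimizer(
    registry=registry,
    program=program,
    optimizer_llm=llm,
    auto="light",               # when auto is set, num_candidates and max_steps must stay unset
    max_bootstrapped_demos=2,
    max_labeled_demos=2,
    save_path="output/mipro",   # optimization results are written here
)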

optimize

optimize(dataset: Benchmark, metric_name: Optional[str] = None, **kwargs)

Optimize the program using the Mipro algorithm.

Parameters:

    dataset (Benchmark, required): A Benchmark object that contains the training and validation data.
    metric_name (Optional[str], default None): The name of the metric to use for optimization. Only used when self.evaluator is not provided; in that case the evaluator is constructed from the benchmark's evaluate method (which returns a dictionary of scores) and the metric specified by metric_name is used for optimization. If not provided, the average of all scores returned by the evaluator is used. If self.evaluator is provided, this argument is ignored.
    **kwargs: Additional keyword arguments to pass to the evaluator.

Source code in evoagentx/optimizers/mipro_optimizer.py
def optimize(self, dataset: Benchmark, metric_name: Optional[str] = None, **kwargs):

    """
    Optimize the program using the Mipro algorithm. 

    Args:
        dataset (Benchmark): a Benchmark object that contains the training and validation data. 
        metric_name (Optional[str]): the name of the metric to use for optimization. Only used when `self.evaluator` is not provided. 
            In this case, the evaluator will be constructed using the `evaluate` method (return a dictionary of scores) in the benchmark, 
            and the metric specified by `metric_name` will be used for optimization. If not provided, the average of all scores returned by the evaluator will be used. 
            If `self.evaluator` is provided, this argument will be ignored. 
        **kwargs: additional keyword arguments to pass to the evaluator. 
    """

    zeroshot_opt = (self.max_bootstrapped_demos == 0) and (self.max_labeled_demos == 0)
    student = self.model
    num_trials = self.max_steps
    minibatch = self.minibatch
    self.metric_name = metric_name

    # If auto is None, and num_trials is not provided (but num_candidates is), raise an error that suggests a good num_trials value
    if self.auto is None and (self.num_candidates is not None and num_trials is None):
        raise ValueError(f"If auto is None, max_steps must also be provided. Given num_candidates={self.num_candidates}, we'd recommend setting max_steps to ~{self._set_num_trials_from_num_candidates(self.model, zeroshot_opt, self.num_candidates)}.")

    # If auto is None, and num_candidates or num_trials is None, raise an error
    if self.auto is None and (self.num_candidates is None or num_trials is None):
        raise ValueError("If auto is None, num_candidates must also be provided.")

    # If auto is provided, and either num_candidates or num_trials is not None, raise an error
    if self.auto is not None and (self.num_candidates is not None or num_trials is not None):
        raise ValueError("If auto is not None, num_candidates and max_steps cannot be set, since they would be overrided by the auto settings. Please either set auto to None, or do not specify num_candidates and max_steps.")

    # Set random seeds
    seed = self.seed
    self._set_random_seeds(seed)

    # Set training & validation sets
    trainset, valset = self._set_and_validate_datasets(dataset=dataset)

    # Set hyperparameters based on run mode (if set)
    num_trials, valset, minibatch = self._set_hyperparams_from_run_mode(
        student, num_trials, minibatch, zeroshot_opt, valset
    )

    if self.auto: 
        self._print_auto_run_settings(num_trials, minibatch, valset)

    if minibatch and self.minibatch_size > len(valset):
        raise ValueError(f"Minibatch size cannot exceed the size of the valset. Valset size: {len(valset)}.")

    # # Estimate LM calls and get user confirmation
    if self.requires_permission_to_run:
        if not self._get_user_confirmation(
            student,
            num_trials,
            minibatch,
            self.minibatch_size,
            self.minibatch_full_eval_steps,
            valset,
            self.program_aware_proposer,
        ):
            logger.info("Compilation aborted by the user.")
            return student  # Return the original student program

    program = student.deepcopy()

    # check the evaluator (If None, will construct a default evaluator using the `evaluate` method in the benchmark) and wrap it with runtime checks
    evaluator = self._validate_evaluator(evaluator=self.evaluator, benchmark=dataset, metric_name=metric_name)
    self.metric = evaluator.metric

    # Step 1: Bootstrap few-shot examples 
    demo_candidates = self._bootstrap_fewshot_examples(program, trainset, seed, teacher=None)

    # Step 2: Propose instruction candidates 
    with suppress_cost_logging():
        instruction_candidates = self._propose_instructions(
            program,
            trainset,
            demo_candidates,
            self.view_data_batch_size,
            self.program_aware_proposer,
            self.data_aware_proposer,
            self.tip_aware_proposer,
            self.fewshot_aware_proposer,
        )

    # Step 3: Find optimal prompt parameters 
    with suppress_cost_logging():
        best_program = self._optimize_prompt_parameters(
            program,
            instruction_candidates,
            demo_candidates,
            evaluator,
            valset,
            num_trials,
            minibatch,
            self.minibatch_size,
            self.minibatch_full_eval_steps,
            seed,
        )

    if self.save_path:
        os.makedirs(self.save_path, exist_ok=True)
        self.best_program_path = os.path.join(self.save_path, "best_program.json")
        best_program.save(self.best_program_path)

    # reset the self.model. After optimization, the model will be reset to the original state.
    # This is necessary to avoid the model being modified by the optimization process. 
    # Use self.restore_best_program() to restore the best program. 
    self.model.reset()
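
A usage sketch; the metric name is a placeholder and only matters when no custom evaluator was supplied. The restore call uses the method name mentioned in the source comment above.

# Run MIPRO optimization against a benchmark; if save_path is set, the best
# program is written to save_path/best_program.json.
optimizer.optimize(dataset=benchmark, metric_name="solve_rate")

# self.model is reset after optimization; restore the best program explicitly.
optimizer.restore_best_program()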

WorkFlowMiproOptimizer

WorkFlowMiproOptimizer(graph: WorkFlowGraph, evaluator: Evaluator, optimizer_llm: Optional[BaseLLM] = None, **kwargs)

Bases: MiproOptimizer

MiproOptimizer tailored for workflow graphs.

Parameters:

    graph (WorkFlowGraph, required): The workflow graph to optimize.
    evaluator (Evaluator, required): The evaluator to use for the optimization.
    optimizer_llm (BaseLLM, default None): The LLM to use for the optimization. If None, the LLM in the evaluator is used.
    **kwargs: Additional keyword arguments passed to MiproOptimizer. Available options:
        metric_threshold (Optional[float]): Threshold for the metric score. If provided, only examples with scores above this threshold will be used as demonstrations.
        max_bootstrapped_demos (int): Maximum number of bootstrapped demonstrations to use. Defaults to 4.
        max_labeled_demos (int): Maximum number of labeled demonstrations to use. Defaults to 4.
        auto (Optional[Literal["light", "medium", "heavy"]]): Automatic configuration mode. If set, overrides num_candidates and max_steps. "light": n=6, val_size=100; "medium": n=12, val_size=300; "heavy": n=18, val_size=1000. Defaults to "medium".
        max_steps (int): Maximum number of optimization steps. Required if auto is None.
        num_candidates (Optional[int]): Number of candidates to generate for each optimization step. Required if auto is None.
        num_threads (Optional[int]): Number of threads to use for parallel evaluation. If None, a single thread is used.
        max_errors (int): Maximum number of errors allowed during evaluation before stopping. Defaults to 10.
        seed (int): Random seed for reproducibility. Defaults to 9.
        init_temperature (float): Initial temperature for instruction generation. Defaults to 0.5.
        track_stats (bool): Whether to track optimization statistics. Defaults to True.
        save_path (Optional[str]): Path to save optimization results. If None, results will not be saved.
        minibatch (bool): Whether to use minibatch evaluation during optimization. Defaults to True.
        minibatch_size (int): Size of the minibatch for evaluation. Defaults to 35.
        minibatch_full_eval_steps (int): Number of minibatch steps between full evaluations. Defaults to 5.
        program_aware_proposer (bool): Whether to use the program-aware instruction proposer. Defaults to True.
        data_aware_proposer (bool): Whether to use the data-aware instruction proposer. Defaults to True.
        view_data_batch_size (int): Batch size for viewing data during instruction proposal. Defaults to 10.
        tip_aware_proposer (bool): Whether to use the tip-aware instruction proposer. Defaults to True.
        fewshot_aware_proposer (bool): Whether to use the fewshot-aware instruction proposer. Defaults to True.
        requires_permission_to_run (bool): Whether to require user permission before running optimization. Defaults to False.
        provide_traceback (Optional[bool]): Whether to provide a traceback for evaluation errors. If None, the default setting is used.

Source code in evoagentx/optimizers/mipro_optimizer.py
def __init__(
    self, 
    graph: WorkFlowGraph,
    evaluator: Evaluator, 
    optimizer_llm: Optional[BaseLLM] = None, 
    **kwargs, 
):
    """
    MiproOptimizer tailored for workflow graphs. 

    Args:
        graph (WorkFlowGraph): the workflow graph to optimize.
        evaluator (Evaluator): the evaluator to use for the optimization.
        optimizer_llm (BaseLLM): the LLM to use for the optimization. If None, will use the LLM model in the evaluator.
        **kwargs: additional keyword arguments to pass to the MiproOptimizer. Available options:
            - metric_threshold (Optional[int]): threshold for the metric score. If provided, only examples with scores above this threshold will be used as demonstrations.
            - max_bootstrapped_demos (int): maximum number of bootstrapped demonstrations to use. Defaults to 4.
            - max_labeled_demos (int): maximum number of labeled demonstrations to use. Defaults to 4.
            - auto (Optional[Literal["light", "medium", "heavy"]]): automatic configuration mode. If set, will override num_candidates and max_steps. 
                "light": n=6, val_size=100; "medium": n=12, val_size=300; "heavy": n=18, val_size=1000. Defaults to "medium".
            - max_steps (int): maximum number of optimization steps. Required if auto is None.
            - num_candidates (Optional[int]): number of candidates to generate for each optimization step. Required if auto is None.
            - num_threads (Optional[int]): number of threads to use for parallel evaluation. If None, will use single thread.
            - max_errors (int): maximum number of errors allowed during evaluation before stopping. Defaults to 10.
            - seed (int): random seed for reproducibility. Defaults to 9.
            - init_temperature (float): initial temperature for instruction generation. Defaults to 0.5.
            - track_stats (bool): whether to track optimization statistics. Defaults to True.
            - save_path (Optional[str]): path to save optimization results. If None, results will not be saved.
            - minibatch (bool): whether to use minibatch evaluation during optimization. Defaults to True.
            - minibatch_size (int): size of minibatch for evaluation. Defaults to 35.
            - minibatch_full_eval_steps (int): number of minibatch steps between full evaluations. Defaults to 5.
            - program_aware_proposer (bool): whether to use program-aware instruction proposer. Defaults to True.
            - data_aware_proposer (bool): whether to use data-aware instruction proposer. Defaults to True.
            - view_data_batch_size (int): batch size for viewing data during instruction proposal. Defaults to 10.
            - tip_aware_proposer (bool): whether to use tip-aware instruction proposer. Defaults to True.
            - fewshot_aware_proposer (bool): whether to use fewshot-aware instruction proposer. Defaults to True.
            - requires_permission_to_run (bool): whether to require user permission before running optimization. Defaults to False.
            - provide_traceback (Optional[bool]): whether to provide traceback for evaluation errors. If None, will use default setting.
    """

    # check if the graph is compatible with the WorkFlowMipro optimizer.
    graph = self._validate_graph_compatibility(graph=graph)

    # convert the workflow graph to a callable program  
    workflow_graph_program = WorkFlowGraphProgram(
        graph=graph, 
        agent_manager=evaluator.agent_manager, 
        executor_llm=evaluator.llm, 
        collate_func=evaluator.collate_func, 
        output_postprocess_func=evaluator.output_postprocess_func, 
    )

    # register optimizable parameters 
    registry = self._register_optimizable_parameters(program=workflow_graph_program)

    super().__init__(
        registry=registry, 
        program=workflow_graph_program, 
        optimizer_llm=optimizer_llm or evaluator.llm, 
        evaluator=evaluator,
        **kwargs
    )
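
An end-to-end sketch, assuming a WorkFlowGraph, an Evaluator, and a Benchmark have been built elsewhere; the keyword options are forwarded to MiproOptimizer as documented above.

# Wrap an existing workflow graph and evaluator; the evaluator's LLM is
# reused for optimization when optimizer_llm is omitted.
optimizer = WorkFlowMiproOptimizer(
    graph=workflow_graph,
    evaluator=evaluator,
    auto="light",
    save_path="output/mipro_workflow",
)
optimizer.optimize(dataset=benchmark, metric_name="f1")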