AutoGPT/classic/benchmark/agbenchmark/main.py

import logging
import os
from pathlib import Path
from typing import Optional, Sequence

from dotenv import load_dotenv

from agbenchmark.challenges import get_unique_categories
from agbenchmark.config import AgentBenchmarkConfig

load_dotenv()

logger = logging.getLogger(__name__)


def run_benchmark(
    config: AgentBenchmarkConfig,
    maintain: bool = False,
    improve: bool = False,
    explore: bool = False,
    tests: tuple[str, ...] = tuple(),
    categories: tuple[str, ...] = tuple(),
    skip_categories: tuple[str, ...] = tuple(),
    attempts_per_challenge: int = 1,
    mock: bool = False,
    no_dep: bool = False,
    no_cutoff: bool = False,
    cutoff: Optional[int] = None,
    keep_answers: bool = False,
    server: bool = False,
) -> int:
    """
    Starts the benchmark. If a category flag is provided, only challenges with the
    corresponding mark will be run.
    """
    import pytest

    from agbenchmark.reports.ReportManager import SingletonReportManager

    validate_args(
        maintain=maintain,
        improve=improve,
        explore=explore,
        tests=tests,
        categories=categories,
        skip_categories=skip_categories,
        no_cutoff=no_cutoff,
        cutoff=cutoff,
    )
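
    # Set up the singleton report manager that collects the results of this run.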
    SingletonReportManager()

    for key, value in vars(config).items():
        logger.debug(f"config.{key} = {repr(value)}")
pytest_args = ["-vs"]
if tests:
logger.info(f"Running specific test(s): {' '.join(tests)}")
pytest_args += [f"--test={t}" for t in tests]
else:
all_categories = get_unique_categories()
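
        # Resolve the categories to run: start from the explicit selection (or all
        # categories if only skips were given), then drop the skipped ones.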
        if categories or skip_categories:
            categories_to_run = set(categories) or all_categories
            if skip_categories:
                categories_to_run = categories_to_run.difference(set(skip_categories))
            assert categories_to_run, "Error: You can't skip all categories"
            pytest_args += [f"--category={c}" for c in categories_to_run]
            logger.info(f"Running tests of category: {categories_to_run}")
        else:
            logger.info("Running all categories")

        if maintain:
            logger.info("Running only regression tests")
        elif improve:
            logger.info("Running only non-regression tests")
        elif explore:
            logger.info("Only attempt challenges that have never been beaten")

    if mock:
        # TODO: unhack
        # Ugly hack to make the mock work when calling from the API
        os.environ["IS_MOCK"] = "True"

    # Pass through flags
    for flag, active in {
        "--maintain": maintain,
        "--improve": improve,
        "--explore": explore,
        "--no-dep": no_dep,
        "--mock": mock,
        "--nc": no_cutoff,
        "--keep-answers": keep_answers,
    }.items():
        if active:
            pytest_args.append(flag)

    if attempts_per_challenge > 1:
        pytest_args.append(f"--attempts={attempts_per_challenge}")

    if cutoff:
        pytest_args.append(f"--cutoff={cutoff}")
        logger.debug(f"Setting cutoff override to {cutoff} seconds.")
    current_dir = Path(__file__).resolve().parent
    pytest_args.append(str(current_dir / "generate_test.py"))

    pytest_args.append("--cache-clear")
    logger.debug(f"Running Pytest with args: {pytest_args}")
    exit_code = pytest.main(pytest_args)

    SingletonReportManager.clear_instance()

    return exit_code


class InvalidInvocationError(ValueError):
    """Raised when the benchmark is invoked with an invalid combination of options."""


def validate_args(
    maintain: bool,
    improve: bool,
    explore: bool,
    tests: Sequence[str],
    categories: Sequence[str],
    skip_categories: Sequence[str],
    no_cutoff: bool,
    cutoff: Optional[int],
) -> None:
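    """Check that the given benchmark options are mutually consistent.

    Raises:
        InvalidInvocationError: if an unknown category is specified or mutually
            exclusive options are combined.
    """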
    if categories:
        all_categories = get_unique_categories()
        invalid_categories = set(categories) - all_categories

        if invalid_categories:
            raise InvalidInvocationError(
                "One or more invalid categories were specified: "
                f"{', '.join(invalid_categories)}.\n"
                f"Valid categories are: {', '.join(all_categories)}."
            )

    if (maintain + improve + explore) > 1:
        raise InvalidInvocationError(
            "You can't use --maintain, --improve or --explore at the same time. "
            "Please choose one."
        )

    if tests and (categories or skip_categories or maintain or improve or explore):
        raise InvalidInvocationError(
            "If you're running a specific test, make sure no other options are "
            "selected. Please pass only --test."
        )

    if no_cutoff and cutoff:
        raise InvalidInvocationError(
            "You can't use both --nc and --cutoff at the same time. "
            "Please choose one."
        )
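

# Example usage (illustrative sketch): running the benchmark programmatically.
# AgentBenchmarkConfig.load() and the "coding" category are assumptions about
# agbenchmark.config and the challenge set, not guaranteed by this module.
#
#     config = AgentBenchmarkConfig.load()
#     exit_code = run_benchmark(config, categories=("coding",), mock=True)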