From 1018a693dd0eeb4e5c3526756767cf32848c7579 Mon Sep 17 00:00:00 2001 From: ali Date: Thu, 18 Dec 2025 17:54:13 +0200 Subject: [PATCH 01/12] optimization effort --- codeflash/api/aiservice.py | 9 ++-- codeflash/cli_cmds/cli.py | 3 ++ codeflash/code_utils/config_consts.py | 49 +++++++++++++++----- codeflash/code_utils/git_utils.py | 34 -------------- codeflash/optimization/function_optimizer.py | 14 +++--- 5 files changed, 49 insertions(+), 60 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index 8743ab2ac..217f52167 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -12,7 +12,6 @@ from codeflash.cli_cmds.console import console, logger from codeflash.code_utils.code_replacer import is_zero_diff from codeflash.code_utils.code_utils import unified_diff_strings -from codeflash.code_utils.config_consts import N_CANDIDATES_EFFECTIVE, N_CANDIDATES_LP_EFFECTIVE from codeflash.code_utils.env_utils import get_codeflash_api_key from codeflash.code_utils.git_utils import get_last_commit_author_if_pr_exists, get_repo_owner_and_name from codeflash.code_utils.time_utils import humanize_runtime @@ -130,7 +129,7 @@ def optimize_python_code( # noqa: D417 payload = { "source_code": source_code, "dependency_code": dependency_code, - "num_variants": num_candidates, + "n_candidates": num_candidates, "trace_id": trace_id, "python_version": platform.python_version(), "experiment_metadata": experiment_metadata, @@ -138,7 +137,6 @@ def optimize_python_code( # noqa: D417 "current_username": get_last_commit_author_if_pr_exists(None), "repo_owner": git_repo_owner, "repo_name": git_repo_name, - "n_candidates": N_CANDIDATES_EFFECTIVE, "is_async": is_async, } @@ -172,7 +170,7 @@ def optimize_python_code_line_profiler( # noqa: D417 dependency_code: str, trace_id: str, line_profiler_results: str, - num_candidates: int = 10, + num_candidates: int = 8, experiment_metadata: ExperimentMetadata | None = None, ) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -193,14 +191,13 @@ def optimize_python_code_line_profiler( # noqa: D417 payload = { "source_code": source_code, "dependency_code": dependency_code, - "num_variants": num_candidates, + "n_candidates_lp": num_candidates, "line_profiler_results": line_profiler_results, "trace_id": trace_id, "python_version": platform.python_version(), "experiment_metadata": experiment_metadata, "codeflash_version": codeflash_version, "lsp_mode": is_LSP_enabled(), - "n_candidates_lp": N_CANDIDATES_LP_EFFECTIVE, } console.rule() diff --git a/codeflash/cli_cmds/cli.py b/codeflash/cli_cmds/cli.py index a6e28aaaa..139a25d98 100644 --- a/codeflash/cli_cmds/cli.py +++ b/codeflash/cli_cmds/cli.py @@ -104,6 +104,9 @@ def parse_args() -> Namespace: action="store_true", help="(Deprecated) Async function optimization is now enabled by default. 
This flag is ignored.", ) + parser.add_argument( + "--effort", type=str, help="Effort level for optimization", choices=["low", "medium", "high"], default="medium" + ) args, unknown_args = parser.parse_known_args() sys.argv[:] = [sys.argv[0], *unknown_args] diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index 6b2805fbf..31bb78db1 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -1,27 +1,20 @@ +from enum import Enum + MAX_TEST_RUN_ITERATIONS = 5 INDIVIDUAL_TESTCASE_TIMEOUT = 15 MAX_FUNCTION_TEST_SECONDS = 60 -N_CANDIDATES = 5 MIN_IMPROVEMENT_THRESHOLD = 0.05 MIN_THROUGHPUT_IMPROVEMENT_THRESHOLD = 0.10 # 10% minimum improvement for async throughput MAX_TEST_FUNCTION_RUNS = 50 MAX_CUMULATIVE_TEST_RUNTIME_NANOSECONDS = 100e6 # 100ms -N_TESTS_TO_GENERATE = 2 TOTAL_LOOPING_TIME = 10.0 # 10 second candidate benchmarking budget COVERAGE_THRESHOLD = 60.0 MIN_TESTCASE_PASSED_THRESHOLD = 6 REPEAT_OPTIMIZATION_PROBABILITY = 0.1 DEFAULT_IMPORTANCE_THRESHOLD = 0.001 -N_CANDIDATES_LP = 6 # LSP-specific -N_CANDIDATES_LSP = 3 -N_TESTS_TO_GENERATE_LSP = 2 TOTAL_LOOPING_TIME_LSP = 10.0 # Kept same timing for LSP mode to avoid in increase in performance reporting -N_CANDIDATES_LP_LSP = 3 - -MAX_N_CANDIDATES = 5 -MAX_N_CANDIDATES_LP = 6 try: from codeflash.lsp.helpers import is_LSP_enabled @@ -30,9 +23,41 @@ except ImportError: _IS_LSP_ENABLED = False -N_CANDIDATES_EFFECTIVE = min(N_CANDIDATES_LSP if _IS_LSP_ENABLED else N_CANDIDATES, MAX_N_CANDIDATES) -N_CANDIDATES_LP_EFFECTIVE = min(N_CANDIDATES_LP_LSP if _IS_LSP_ENABLED else N_CANDIDATES_LP, MAX_N_CANDIDATES_LP) -N_TESTS_TO_GENERATE_EFFECTIVE = N_TESTS_TO_GENERATE_LSP if _IS_LSP_ENABLED else N_TESTS_TO_GENERATE TOTAL_LOOPING_TIME_EFFECTIVE = TOTAL_LOOPING_TIME_LSP if _IS_LSP_ENABLED else TOTAL_LOOPING_TIME MAX_CONTEXT_LEN_REVIEW = 1000 + + +class EffortLevel(str, Enum): + LOW = "low" + MEDIUM = "medium" + HIGH = "high" + + +class Effort: + @staticmethod + def get_number_of_optimizer_candidates(effort: str) -> int: + if effort == EffortLevel.LOW.value: + return 3 + if effort == EffortLevel.MEDIUM.value: + return 4 + if effort == EffortLevel.HIGH.value: + return 5 + msg = f"Invalid effort level: {effort}" + raise ValueError(msg) + + @staticmethod + def get_number_of_optimizer_lp_candidates(effort: str) -> int: + if effort == EffortLevel.LOW.value: + return 3 + if effort == EffortLevel.MEDIUM.value: + return 5 + if effort == EffortLevel.HIGH.value: + return 6 + msg = f"Invalid effort level: {effort}" + raise ValueError(msg) + + @staticmethod + def get_number_of_generated_tests(effort: str) -> int: # noqa: ARG004 + # we don't use effort with generated tests for now + return 2 diff --git a/codeflash/code_utils/git_utils.py b/codeflash/code_utils/git_utils.py index 40a725692..c6501b36e 100644 --- a/codeflash/code_utils/git_utils.py +++ b/codeflash/code_utils/git_utils.py @@ -1,10 +1,7 @@ from __future__ import annotations import os -import shutil -import subprocess import sys -import tempfile import time from functools import cache from io import StringIO @@ -16,7 +13,6 @@ from unidiff import PatchSet from codeflash.cli_cmds.console import logger -from codeflash.code_utils.config_consts import N_CANDIDATES_EFFECTIVE if TYPE_CHECKING: from git import Repo @@ -153,36 +149,6 @@ def check_and_push_branch(repo: git.Repo, git_remote: str | None = "origin", *, return True -def create_worktree_root_dir(module_root: Path) -> tuple[Path | None, Path | None]: - git_root = 
git_root_dir() if check_running_in_git_repo(module_root) else None - worktree_root_dir = Path(tempfile.mkdtemp()) if git_root else None - return git_root, worktree_root_dir - - -def create_git_worktrees( - git_root: Path | None, worktree_root_dir: Path | None, module_root: Path -) -> tuple[Path | None, list[Path]]: - if git_root and worktree_root_dir: - worktree_root = Path(tempfile.mkdtemp(dir=worktree_root_dir)) - worktrees = [Path(tempfile.mkdtemp(dir=worktree_root)) for _ in range(N_CANDIDATES_EFFECTIVE + 1)] - for worktree in worktrees: - subprocess.run(["git", "worktree", "add", "-d", worktree], cwd=module_root, check=True) - else: - worktree_root = None - worktrees = [] - return worktree_root, worktrees - - -def remove_git_worktrees(worktree_root: Path | None, worktrees: list[Path]) -> None: - try: - for worktree in worktrees: - subprocess.run(["git", "worktree", "remove", "-f", worktree], check=True) - except subprocess.CalledProcessError as e: - logger.warning(f"Error removing worktrees: {e}") - if worktree_root: - shutil.rmtree(worktree_root) - - def get_last_commit_author_if_pr_exists(repo: Repo | None = None) -> str | None: """Return the author's name of the last commit in the current branch if PR_NUMBER is set. diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 860c2eaf1..b07c008e9 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -42,11 +42,9 @@ from codeflash.code_utils.config_consts import ( COVERAGE_THRESHOLD, INDIVIDUAL_TESTCASE_TIMEOUT, - N_CANDIDATES_EFFECTIVE, - N_CANDIDATES_LP_EFFECTIVE, - N_TESTS_TO_GENERATE_EFFECTIVE, REPEAT_OPTIMIZATION_PROBABILITY, TOTAL_LOOPING_TIME_EFFECTIVE, + Effort, ) from codeflash.code_utils.deduplicate_code import normalize_code from codeflash.code_utils.edit_generated_tests import ( @@ -239,7 +237,7 @@ def __init__( self.function_benchmark_timings = function_benchmark_timings if function_benchmark_timings else {} self.total_benchmark_timings = total_benchmark_timings if total_benchmark_timings else {} self.replay_tests_dir = replay_tests_dir if replay_tests_dir else None - n_tests = N_TESTS_TO_GENERATE_EFFECTIVE + n_tests = Effort.get_number_of_generated_tests(args.effort) self.executor = concurrent.futures.ThreadPoolExecutor( max_workers=n_tests + 3 if self.experiment_id is None else n_tests + 4 ) @@ -287,7 +285,7 @@ def generate_and_instrument_tests( str, ]: """Generate and instrument tests for the function.""" - n_tests = N_TESTS_TO_GENERATE_EFFECTIVE + n_tests = Effort.get_number_of_generated_tests(self.args.effort) generated_test_paths = [ get_test_file_path( self.test_cfg.tests_root, self.function_to_optimize.function_name, test_index, test_type="unit" @@ -842,7 +840,7 @@ def determine_best_candidate( dependency_code=code_context.read_only_context_code, trace_id=self.get_trace_id(exp_type), line_profiler_results=original_code_baseline.line_profile_results["str_out"], - num_candidates=N_CANDIDATES_LP_EFFECTIVE, + num_candidates=Effort.get_number_of_optimizer_lp_candidates(self.args.effort), experiment_metadata=ExperimentMetadata( id=self.experiment_id, group="control" if exp_type == "EXP0" else "experiment" ) @@ -1211,7 +1209,7 @@ def generate_tests( generated_perf_test_paths: list[Path], ) -> Result[tuple[int, GeneratedTestsList, dict[str, set[FunctionCalledInTest]], str], str]: """Generate unit tests and concolic tests for the function.""" - n_tests = N_TESTS_TO_GENERATE_EFFECTIVE + n_tests = 
Effort.get_number_of_generated_tests(self.args.effort) assert len(generated_test_paths) == n_tests # Submit test generation tasks @@ -1273,7 +1271,7 @@ def generate_optimizations( run_experiment: bool = False, # noqa: FBT001, FBT002 ) -> Result[tuple[OptimizationSet, str], str]: """Generate optimization candidates for the function.""" - n_candidates = N_CANDIDATES_EFFECTIVE + n_candidates = Effort.get_number_of_optimizer_candidates(self.args.effort) future_optimization_candidates = self.executor.submit( self.aiservice_client.optimize_python_code, From 3e20a37f9426612d34ddaef136a4c4f98337922c Mon Sep 17 00:00:00 2001 From: ali Date: Fri, 19 Dec 2025 16:47:25 +0200 Subject: [PATCH 02/12] more effort values --- codeflash/code_utils/config_consts.py | 78 +++++++++++--------- codeflash/optimization/function_optimizer.py | 35 +++++---- 2 files changed, 62 insertions(+), 51 deletions(-) diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index 7cabe017b..e252fe9d1 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -1,4 +1,4 @@ -from enum import Enum +from enum import StrEnum, auto MAX_TEST_RUN_ITERATIONS = 5 INDIVIDUAL_TESTCASE_TIMEOUT = 15 @@ -13,18 +13,11 @@ REPEAT_OPTIMIZATION_PROBABILITY = 0.1 DEFAULT_IMPORTANCE_THRESHOLD = 0.001 -# Refinement -REFINE_ALL_THRESHOLD = 2 # when valid optimizations count is 2 or less, refine all optimizations REFINED_CANDIDATE_RANKING_WEIGHTS = (2, 1) # (runtime, diff), runtime is more important than diff by a factor of 2 -TOP_N_REFINEMENTS = 0.45 # top 45% of valid optimizations (based on the weighted score) are refined # LSP-specific TOTAL_LOOPING_TIME_LSP = 10.0 # Kept same timing for LSP mode to avoid in increase in performance reporting -# Code repair -REPAIR_UNMATCHED_PERCENTAGE_LIMIT = 0.4 # if the percentage of unmatched tests is greater than this, we won't fix it (lowering this value makes the repair more stricted) -MAX_REPAIRS_PER_TRACE = 4 # maximum number of repairs we will do for each function - try: from codeflash.lsp.helpers import is_LSP_enabled @@ -37,36 +30,49 @@ MAX_CONTEXT_LEN_REVIEW = 1000 -class EffortLevel(str, Enum): - LOW = "low" - MEDIUM = "medium" - HIGH = "high" +class EffortLevel(StrEnum): + LOW = auto() + MEDIUM = auto() + HIGH = auto() -class Effort: - @staticmethod - def get_number_of_optimizer_candidates(effort: str) -> int: - if effort == EffortLevel.LOW.value: - return 3 - if effort == EffortLevel.MEDIUM.value: - return 4 - if effort == EffortLevel.HIGH.value: - return 5 - msg = f"Invalid effort level: {effort}" - raise ValueError(msg) +class EffortKeys(StrEnum): + N_OPTIMIZER_CANDIDATES = auto() + N_OPTIMIZER_LP_CANDIDATES = auto() + N_GENERATED_TESTS = auto() + MAX_CODE_REPAIRS_PER_TRACE = auto() + REPAIR_UNMATCHED_PERCENTAGE_LIMIT = auto() + REFINE_ALL_THRESHOLD = auto() + TOP_VALID_CANDIDATES_FOR_REFINEMENT = auto() + - @staticmethod - def get_number_of_optimizer_lp_candidates(effort: str) -> int: - if effort == EffortLevel.LOW.value: - return 3 - if effort == EffortLevel.MEDIUM.value: - return 5 - if effort == EffortLevel.HIGH.value: - return 6 +EFFORT_VALUES: dict[str, dict[EffortLevel, any]] = { + EffortKeys.N_OPTIMIZER_CANDIDATES.value: {EffortLevel.LOW: 3, EffortLevel.MEDIUM: 4, EffortLevel.HIGH: 5}, + EffortKeys.N_OPTIMIZER_LP_CANDIDATES.value: {EffortLevel.LOW: 3, EffortLevel.MEDIUM: 5, EffortLevel.HIGH: 6}, + # we don't use effort with generated tests for now + EffortKeys.N_GENERATED_TESTS.value: {EffortLevel.LOW: 2, 
EffortLevel.MEDIUM: 2, EffortLevel.HIGH: 2},
+    # maximum number of repairs we will do for each function
+    EffortKeys.MAX_CODE_REPAIRS_PER_TRACE.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 4, EffortLevel.HIGH: 5},
+    # if the percentage of unmatched tests is greater than this, we won't fix it (lowering this value makes the repair more strict)
+    # on the low effort we lower the limit to 20% to be more strict (less repairs)
+    EffortKeys.REPAIR_UNMATCHED_PERCENTAGE_LIMIT.value: {
+        EffortLevel.LOW: 0.2,
+        EffortLevel.MEDIUM: 0.4,
+        EffortLevel.HIGH: 0.5,
+    },
+    # when valid optimizations count is N or less, refine all optimizations
+    EffortKeys.REFINE_ALL_THRESHOLD.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 3, EffortLevel.HIGH: 4},
+    # Top valid candidates percentage for refinements
+    EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 3, EffortLevel.HIGH: 4},
+}
+
+
+def get_effort_value(key: EffortKeys, effort: EffortLevel) -> any:
+    key_str = key.value
+    if key_str in EFFORT_VALUES:
+        if effort in EFFORT_VALUES[key_str]:
+            return EFFORT_VALUES[key_str][effort]
     msg = f"Invalid effort level: {effort}"
     raise ValueError(msg)
-
-    @staticmethod
-    def get_number_of_generated_tests(effort: str) -> int:  # noqa: ARG004
-        # we don't use effort with generated tests for now
-        return 2
+    msg = f"Invalid key: {key_str}"
+    raise ValueError(msg)
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
index 81c51fc3e..8771ff304 100644
--- a/codeflash/optimization/function_optimizer.py
+++ b/codeflash/optimization/function_optimizer.py
@@ -45,14 +45,11 @@
 from codeflash.code_utils.config_consts import (
     COVERAGE_THRESHOLD,
     INDIVIDUAL_TESTCASE_TIMEOUT,
-    MAX_REPAIRS_PER_TRACE,
-    REFINE_ALL_THRESHOLD,
     REFINED_CANDIDATE_RANKING_WEIGHTS,
-    REPAIR_UNMATCHED_PERCENTAGE_LIMIT,
     REPEAT_OPTIMIZATION_PROBABILITY,
-    TOP_N_REFINEMENTS,
     TOTAL_LOOPING_TIME_EFFECTIVE,
-    Effort,
+    EffortKeys,
+    get_effort_value,
 )
 from codeflash.code_utils.deduplicate_code import normalize_code
 from codeflash.code_utils.edit_generated_tests import (
@@ -191,8 +188,16 @@ def refine_optimizations(self, request: list[AIServiceRefinerRequest]) -> concur
 
     def _process_refinement_results(self) -> OptimizedCandidate | None:
         """Process refinement results and add to queue. 
We generate a weighted ranking based on the runtime and diff lines and select the best (round of 45%) of valid optimizations to be refined.""" future_refinements: list[concurrent.futures.Future] = [] + top_n_candidates = int( + min( + get_effort_value(EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT, self.args.effort), + len(self.all_refinements_data), + ) + ) - if len(self.all_refinements_data) <= REFINE_ALL_THRESHOLD: + if top_n_candidates == len(self.all_refinements_data) or len(self.all_refinements_data) <= get_effort_value( + EffortKeys.REFINE_ALL_THRESHOLD, self.args.effort + ): for data in self.all_refinements_data: future_refinements.append(self.refine_optimizations([data])) # noqa: PERF401 else: @@ -209,7 +214,6 @@ def _process_refinement_results(self) -> OptimizedCandidate | None: diffs_norm = normalize_by_max(diff_lens_list) # the lower the better score_dict = create_score_dictionary_from_metrics(weights, runtime_norm, diffs_norm) - top_n_candidates = int((TOP_N_REFINEMENTS * len(runtimes_list)) + 0.5) top_indecies = sorted(score_dict, key=score_dict.get)[:top_n_candidates] for idx in top_indecies: @@ -310,7 +314,7 @@ def __init__( self.function_benchmark_timings = function_benchmark_timings if function_benchmark_timings else {} self.total_benchmark_timings = total_benchmark_timings if total_benchmark_timings else {} self.replay_tests_dir = replay_tests_dir if replay_tests_dir else None - n_tests = Effort.get_number_of_generated_tests(args.effort) + n_tests = get_effort_value(EffortKeys.N_GENERATED_TESTS, args.effort) self.executor = concurrent.futures.ThreadPoolExecutor( max_workers=n_tests + 3 if self.experiment_id is None else n_tests + 4 ) @@ -360,7 +364,7 @@ def generate_and_instrument_tests( str, ]: """Generate and instrument tests for the function.""" - n_tests = Effort.get_number_of_generated_tests(self.args.effort) + n_tests = get_effort_value(EffortKeys.N_GENERATED_TESTS, self.args.effort) generated_test_paths = [ get_test_file_path( self.test_cfg.tests_root, self.function_to_optimize.function_name, test_index, test_type="unit" @@ -925,7 +929,7 @@ def determine_best_candidate( dependency_code=code_context.read_only_context_code, trace_id=self.get_trace_id(exp_type), line_profiler_results=original_code_baseline.line_profile_results["str_out"], - num_candidates=Effort.get_number_of_optimizer_lp_candidates(self.args.effort), + num_candidates=get_effort_value(EffortKeys.N_OPTIMIZER_LP_CANDIDATES, self.args.effort), experiment_metadata=ExperimentMetadata( id=self.experiment_id, group="control" if exp_type == "EXP0" else "experiment" ) @@ -1290,7 +1294,7 @@ def generate_tests( generated_perf_test_paths: list[Path], ) -> Result[tuple[int, GeneratedTestsList, dict[str, set[FunctionCalledInTest]], str], str]: """Generate unit tests and concolic tests for the function.""" - n_tests = Effort.get_number_of_generated_tests(self.args.effort) + n_tests = get_effort_value(EffortKeys.N_GENERATED_TESTS, self.args.effort) assert len(generated_test_paths) == n_tests # Submit test generation tasks @@ -1352,7 +1356,7 @@ def generate_optimizations( run_experiment: bool = False, # noqa: FBT001, FBT002 ) -> Result[tuple[OptimizationSet, str], str]: """Generate optimization candidates for the function.""" - n_candidates = Effort.get_number_of_optimizer_candidates(self.args.effort) + n_candidates = get_effort_value(EffortKeys.N_OPTIMIZER_CANDIDATES, self.args.effort) future_optimization_candidates = self.executor.submit( self.aiservice_client.optimize_python_code, @@ -1919,8 +1923,9 @@ def 
repair_if_possible( test_results_count: int, exp_type: str, ) -> None: - if self.repair_counter >= MAX_REPAIRS_PER_TRACE: - logger.debug(f"Repair counter reached {MAX_REPAIRS_PER_TRACE}, skipping repair") + max_repairs = get_effort_value(EffortKeys.MAX_CODE_REPAIRS_PER_TRACE, self.args.effort) + if self.repair_counter >= max_repairs: + logger.debug(f"Repair counter reached {max_repairs}, skipping repair") return if candidate.source not in (OptimizedCandidateSource.OPTIMIZE, OptimizedCandidateSource.OPTIMIZE_LP): # only repair the first pass of the candidates for now @@ -1930,7 +1935,7 @@ def repair_if_possible( logger.debug("No diffs found, skipping repair") return result_unmatched_perc = len(diffs) / test_results_count - if result_unmatched_perc > REPAIR_UNMATCHED_PERCENTAGE_LIMIT: + if result_unmatched_perc > get_effort_value(EffortKeys.REPAIR_UNMATCHED_PERCENTAGE_LIMIT, self.args.effort): logger.debug(f"Result unmatched percentage is {result_unmatched_perc * 100}%, skipping repair") return From f4be23b6a95c391ac547c2bb8f4ff8652c324717 Mon Sep 17 00:00:00 2001 From: ali Date: Fri, 19 Dec 2025 17:01:44 +0200 Subject: [PATCH 03/12] fix --- codeflash/code_utils/config_consts.py | 2 +- codeflash/optimization/function_optimizer.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index e252fe9d1..885f48c57 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -62,7 +62,7 @@ class EffortKeys(StrEnum): }, # when valid optimizations count is N or less, refine all optimizations EffortKeys.REFINE_ALL_THRESHOLD.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 3, EffortLevel.HIGH: 4}, - # Top valid candidates percentage for refinements + # Top valid candidates for refinements EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 3, EffortLevel.HIGH: 4}, } diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 8771ff304..670a83d92 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -134,6 +134,7 @@ def __init__( ai_service_client: AiServiceClient, executor: concurrent.futures.ThreadPoolExecutor, future_all_code_repair: list[concurrent.futures.Future], + effort: str, ) -> None: self.candidate_queue = queue.Queue() self.line_profiler_done = False @@ -141,6 +142,7 @@ def __init__( self.candidate_len = len(initial_candidates) self.ai_service_client = ai_service_client self.executor = executor + self.effort = effort # Initialize queue with initial candidates for candidate in initial_candidates: @@ -190,13 +192,13 @@ def _process_refinement_results(self) -> OptimizedCandidate | None: future_refinements: list[concurrent.futures.Future] = [] top_n_candidates = int( min( - get_effort_value(EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT, self.args.effort), + get_effort_value(EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT, self.effort), len(self.all_refinements_data), ) ) if top_n_candidates == len(self.all_refinements_data) or len(self.all_refinements_data) <= get_effort_value( - EffortKeys.REFINE_ALL_THRESHOLD, self.args.effort + EffortKeys.REFINE_ALL_THRESHOLD, self.effort ): for data in self.all_refinements_data: future_refinements.append(self.refine_optimizations([data])) # noqa: PERF401 @@ -944,6 +946,7 @@ def determine_best_candidate( self.aiservice_client, self.executor, self.future_all_code_repair, + 
self.args.effort,
         )
         candidate_index = 0
 
From 2f7fc605dc94376775f3fe4e37f189a4ae8c34f1 Mon Sep 17 00:00:00 2001
From: ali
Date: Tue, 30 Dec 2025 23:08:39 +0200
Subject: [PATCH 04/12] set the right effort level for each case

---
 codeflash/code_utils/config_consts.py        | 5 +----
 codeflash/lsp/server.py                      | 2 ++
 codeflash/optimization/function_optimizer.py | 7 +++----
 codeflash/tracer.py                          | 2 ++
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py
index 885f48c57..1f191e6d6 100644
--- a/codeflash/code_utils/config_consts.py
+++ b/codeflash/code_utils/config_consts.py
@@ -42,7 +42,6 @@ class EffortKeys(StrEnum):
     N_GENERATED_TESTS = auto()
     MAX_CODE_REPAIRS_PER_TRACE = auto()
     REPAIR_UNMATCHED_PERCENTAGE_LIMIT = auto()
-    REFINE_ALL_THRESHOLD = auto()
     TOP_VALID_CANDIDATES_FOR_REFINEMENT = auto()
 
 
@@ -54,14 +53,12 @@ class EffortKeys(StrEnum):
     # maximum number of repairs we will do for each function
     EffortKeys.MAX_CODE_REPAIRS_PER_TRACE.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 4, EffortLevel.HIGH: 5},
     # if the percentage of unmatched tests is greater than this, we won't fix it (lowering this value makes the repair more strict)
-    # on the low effort we lower the limit to 20% to be more strict (less repairs)
+    # on the low effort we lower the limit to 20% to be more strict (less repairs, less time)
     EffortKeys.REPAIR_UNMATCHED_PERCENTAGE_LIMIT.value: {
         EffortLevel.LOW: 0.2,
         EffortLevel.MEDIUM: 0.4,
         EffortLevel.HIGH: 0.5,
     },
-    # when valid optimizations count is N or less, refine all optimizations
-    EffortKeys.REFINE_ALL_THRESHOLD.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 3, EffortLevel.HIGH: 4},
     # Top valid candidates for refinements
     EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 3, EffortLevel.HIGH: 4},
 }
diff --git a/codeflash/lsp/server.py b/codeflash/lsp/server.py
index 582e5033c..6e97a9e15 100644
--- a/codeflash/lsp/server.py
+++ b/codeflash/lsp/server.py
@@ -7,6 +7,7 @@
 from pygls.lsp.server import LanguageServer
 from pygls.protocol import LanguageServerProtocol
 
+from codeflash.code_utils.config_consts import EffortLevel
 from codeflash.either import Result
 from codeflash.models.models import CodeOptimizationContext
 
@@ -37,6 +38,7 @@ def prepare_optimizer_arguments(self, config_file: Path) -> None:
         args.config_file = config_file
         args.no_pr = True  # LSP server should not create PRs
         args.worktree = True
+        args.effort = EffortLevel.LOW.value  # low effort for high speed
         self.args = args
 
         # avoid initializing the optimizer during initialization, because it can cause an error if the api key is invalid
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
index 670a83d92..28f0998f0 100644
--- a/codeflash/optimization/function_optimizer.py
+++ b/codeflash/optimization/function_optimizer.py
@@ -192,14 +192,13 @@ def _process_refinement_results(self) -> OptimizedCandidate | None:
         future_refinements: list[concurrent.futures.Future] = []
         top_n_candidates = int(
             min(
-                get_effort_value(EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT, self.effort),
+                int(get_effort_value(EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT, self.effort)),
                 len(self.all_refinements_data),
             )
         )
 
-        if top_n_candidates == len(self.all_refinements_data) or len(self.all_refinements_data) <= get_effort_value(
-            EffortKeys.REFINE_ALL_THRESHOLD, self.effort
-        ):
+        if top_n_candidates == len(self.all_refinements_data):
+            # if we'll refine all candidates, we can 
skip the ranking and just refine them all for data in self.all_refinements_data: future_refinements.append(self.refine_optimizations([data])) # noqa: PERF401 else: diff --git a/codeflash/tracer.py b/codeflash/tracer.py index eb011befa..c26214f77 100644 --- a/codeflash/tracer.py +++ b/codeflash/tracer.py @@ -24,6 +24,7 @@ from codeflash.cli_cmds.console import console from codeflash.code_utils.code_utils import get_run_tmp_file from codeflash.code_utils.compat import SAFE_SYS_EXECUTABLE +from codeflash.code_utils.config_consts import EffortLevel from codeflash.code_utils.config_parser import parse_config_file from codeflash.tracing.pytest_parallelization import pytest_split @@ -214,6 +215,7 @@ def main(args: Namespace | None = None) -> ArgumentParser: from codeflash.optimization import optimizer + args.effort = EffortLevel.HIGH.value optimizer.run_with_args(args) # Delete the trace file and the replay test file if they exist From a126d9ef8978fca1a79f4701a8efb61f08a74547 Mon Sep 17 00:00:00 2001 From: ali Date: Tue, 6 Jan 2026 02:31:48 +0200 Subject: [PATCH 05/12] number of candidates for model distribution & control adaptive optimization params with effort --- codeflash/api/aiservice.py | 13 +++++++------ codeflash/code_utils/config_consts.py | 20 ++++++++++---------- codeflash/optimization/function_optimizer.py | 14 ++++++++------ 3 files changed, 25 insertions(+), 22 deletions(-) diff --git a/codeflash/api/aiservice.py b/codeflash/api/aiservice.py index 38e21f0a7..14e74ee36 100644 --- a/codeflash/api/aiservice.py +++ b/codeflash/api/aiservice.py @@ -16,7 +16,6 @@ from codeflash.code_utils.env_utils import get_codeflash_api_key from codeflash.code_utils.git_utils import get_last_commit_author_if_pr_exists, get_repo_owner_and_name from codeflash.code_utils.time_utils import humanize_runtime -from codeflash.lsp.helpers import is_LSP_enabled from codeflash.models.ExperimentMetadata import ExperimentMetadata from codeflash.models.models import ( AIServiceRefinerRequest, @@ -128,6 +127,7 @@ def optimize_python_code( # noqa: D417 experiment_metadata: ExperimentMetadata | None = None, *, is_async: bool = False, + n_candidates: int = 5, ) -> list[OptimizedCandidate]: """Optimize the given python code for performance by making a request to the Django endpoint. @@ -138,6 +138,7 @@ def optimize_python_code( # noqa: D417 - trace_id (str): Trace id of optimization run - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization - is_async (bool): Whether the function being optimized is async + - n_candidates (int): Number of candidates to generate Returns ------- @@ -160,10 +161,10 @@ def optimize_python_code( # noqa: D417 "repo_owner": git_repo_owner, "repo_name": git_repo_name, "is_async": is_async, - "lsp_mode": is_LSP_enabled(), "call_sequence": self.get_next_sequence(), + "n_candidates": n_candidates, } - logger.debug(f"Sending optimize request: trace_id={trace_id}, lsp_mode={payload['lsp_mode']}") + logger.debug(f"Sending optimize request: trace_id={trace_id}, n_candidates={payload['n_candidates']}") try: response = self.make_ai_service_request("/optimize", payload=payload, timeout=60) @@ -195,7 +196,7 @@ def optimize_python_code_line_profiler( # noqa: D417 dependency_code: str, trace_id: str, line_profiler_results: str, - num_candidates: int = 8, + n_candidates: int, experiment_metadata: ExperimentMetadata | None = None, ) -> list[OptimizedCandidate]: """Optimize the given python code for performance using line profiler results. 
@@ -207,6 +208,7 @@ def optimize_python_code_line_profiler(  # noqa: D417
     - trace_id (str): Trace id of optimization run
     - line_profiler_results (str): Line profiler output to guide optimization
     - experiment_metadata (Optional[ExperimentalMetadata, None]): Any available experiment metadata for this optimization
+    - n_candidates (int): Number of candidates to generate
 
     Returns
     -------
@@ -223,13 +225,12 @@ def optimize_python_code_line_profiler(  # noqa: D417
     payload = {
         "source_code": source_code,
         "dependency_code": dependency_code,
-        "n_candidates_lp": num_candidates,
+        "n_candidates": n_candidates,
         "line_profiler_results": line_profiler_results,
         "trace_id": trace_id,
         "python_version": platform.python_version(),
         "experiment_metadata": experiment_metadata,
         "codeflash_version": codeflash_version,
-        "lsp_mode": is_LSP_enabled(),
         "call_sequence": self.get_next_sequence(),
     }
 
diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py
index 75949af92..19cdc56c4 100644
--- a/codeflash/code_utils/config_consts.py
+++ b/codeflash/code_utils/config_consts.py
@@ -25,14 +25,6 @@
 # LSP-specific
 TOTAL_LOOPING_TIME_LSP = 10.0  # Kept same timing for LSP mode to avoid in increase in performance reporting
 
-# Adaptive optimization
-# TODO (ali): make this configurable with effort arg once the PR is merged
-ADAPTIVE_OPTIMIZATION_THRESHOLD = 2  # Max adaptive optimizations per single candidate tree (for example : optimize -> refine -> adaptive -> another adaptive).
-# MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE = 4  # maximum number of adaptive optimizations we will do for each function (this can be 2 adaptive optimizations for 2 candidates for example)
-MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE = (
-    0  # disable adaptive optimizations until we have this value controlled by the effort arg
-)
-
 try:
     from codeflash.lsp.helpers import is_LSP_enabled
 
@@ -58,11 +50,13 @@ class EffortKeys(StrEnum):
     MAX_CODE_REPAIRS_PER_TRACE = auto()
     REPAIR_UNMATCHED_PERCENTAGE_LIMIT = auto()
     TOP_VALID_CANDIDATES_FOR_REFINEMENT = auto()
+    ADAPTIVE_OPTIMIZATION_THRESHOLD = auto()
+    MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE = auto()
 
 
 EFFORT_VALUES: dict[str, dict[EffortLevel, any]] = {
-    EffortKeys.N_OPTIMIZER_CANDIDATES.value: {EffortLevel.LOW: 3, EffortLevel.MEDIUM: 4, EffortLevel.HIGH: 5},
-    EffortKeys.N_OPTIMIZER_LP_CANDIDATES.value: {EffortLevel.LOW: 3, EffortLevel.MEDIUM: 5, EffortLevel.HIGH: 6},
+    EffortKeys.N_OPTIMIZER_CANDIDATES.value: {EffortLevel.LOW: 3, EffortLevel.MEDIUM: 5, EffortLevel.HIGH: 6},
+    EffortKeys.N_OPTIMIZER_LP_CANDIDATES.value: {EffortLevel.LOW: 4, EffortLevel.MEDIUM: 6, EffortLevel.HIGH: 7},
     # we don't use effort with generated tests for now
     EffortKeys.N_GENERATED_TESTS.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 2, EffortLevel.HIGH: 2},
     # maximum number of repairs we will do for each function
     EffortKeys.MAX_CODE_REPAIRS_PER_TRACE.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 4, EffortLevel.HIGH: 5},
     # if the percentage of unmatched tests is greater than this, we won't fix it (lowering this value makes the repair more strict)
     # on the low effort we lower the limit to 20% to be more strict (less repairs, less time)
     EffortKeys.REPAIR_UNMATCHED_PERCENTAGE_LIMIT.value: {
         EffortLevel.LOW: 0.2,
         EffortLevel.MEDIUM: 0.4,
         EffortLevel.HIGH: 0.5,
     },
     # Top valid candidates for refinements
     EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 3, EffortLevel.HIGH: 4},
+    EffortKeys.ADAPTIVE_OPTIMIZATION_THRESHOLD.value: {EffortLevel.LOW: 0, EffortLevel.MEDIUM: 1, EffortLevel.HIGH: 3},
+    EffortKeys.MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE.value: {
+        EffortLevel.LOW: 0,
+        EffortLevel.MEDIUM: 3,
+        EffortLevel.HIGH: 10,
+    },
 }
 
diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py
index 34db058ef..8cfed2614 100644
@@ -43,10 +43,8 
@@ unified_diff_strings, ) from codeflash.code_utils.config_consts import ( - ADAPTIVE_OPTIMIZATION_THRESHOLD, COVERAGE_THRESHOLD, INDIVIDUAL_TESTCASE_TIMEOUT, - MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE, REFINED_CANDIDATE_RANKING_WEIGHTS, REPEAT_OPTIMIZATION_PROBABILITY, TOTAL_LOOPING_TIME_EFFECTIVE, @@ -1018,7 +1016,7 @@ def determine_best_candidate( dependency_code=code_context.read_only_context_code, trace_id=self.get_trace_id(exp_type), line_profiler_results=original_code_baseline.line_profile_results["str_out"], - num_candidates=get_effort_value(EffortKeys.N_OPTIMIZER_LP_CANDIDATES, self.args.effort), + n_candidates=get_effort_value(EffortKeys.N_OPTIMIZER_LP_CANDIDATES, self.args.effort), experiment_metadata=ExperimentMetadata( id=self.experiment_id, group="control" if exp_type == "EXP0" else "experiment" ) @@ -1097,7 +1095,9 @@ def call_adaptive_optimize( eval_ctx: CandidateEvaluationContext, ai_service_client: AiServiceClient, ) -> concurrent.futures.Future[OptimizedCandidate | None] | None: - if self.adaptive_optimization_counter >= MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE: + if self.adaptive_optimization_counter >= get_effort_value( + EffortKeys.MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE, self.args.effort + ): logger.debug( f"Max adaptive optimizations reached for {self.function_to_optimize.qualified_name}: {self.adaptive_optimization_counter}" ) @@ -1105,7 +1105,7 @@ def call_adaptive_optimize( adaptive_count = sum(1 for c in prev_candidates if c.source == OptimizedCandidateSource.ADAPTIVE) - if adaptive_count >= ADAPTIVE_OPTIMIZATION_THRESHOLD: + if adaptive_count >= get_effort_value(EffortKeys.ADAPTIVE_OPTIMIZATION_THRESHOLD, self.args.effort): return None request_candidates = [] @@ -1492,7 +1492,7 @@ def generate_optimizations( run_experiment: bool = False, # noqa: FBT001, FBT002 ) -> Result[tuple[OptimizationSet, str], str]: """Generate optimization candidates for the function. 
Backend handles multi-model diversity.""" - # n_candidates = get_effort_value(EffortKeys.N_OPTIMIZER_CANDIDATES, self.args.effort) + n_candidates = get_effort_value(EffortKeys.N_OPTIMIZER_CANDIDATES, self.args.effort) future_optimization_candidates = self.executor.submit( self.aiservice_client.optimize_python_code, read_writable_code.markdown, @@ -1500,6 +1500,7 @@ def generate_optimizations( self.function_trace_id[:-4] + "EXP0" if run_experiment else self.function_trace_id, ExperimentMetadata(id=self.experiment_id, group="control") if run_experiment else None, is_async=self.function_to_optimize.is_async, + n_candidates=n_candidates, ) future_references = self.executor.submit( @@ -1522,6 +1523,7 @@ def generate_optimizations( self.function_trace_id[:-4] + "EXP1", ExperimentMetadata(id=self.experiment_id, group="experiment"), is_async=self.function_to_optimize.is_async, + n_candidates=n_candidates, ) futures.append(future_candidates_exp) From 18e0b249c522aa83cce6caf92189d658594d3c40 Mon Sep 17 00:00:00 2001 From: ali Date: Tue, 6 Jan 2026 03:50:43 +0200 Subject: [PATCH 06/12] default effort value for function optimizer --- codeflash/optimization/function_optimizer.py | 24 ++++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 8cfed2614..a253acefb 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -49,6 +49,7 @@ REPEAT_OPTIMIZATION_PROBABILITY, TOTAL_LOOPING_TIME_EFFECTIVE, EffortKeys, + EffortLevel, get_effort_value, ) from codeflash.code_utils.deduplicate_code import normalize_code @@ -375,6 +376,9 @@ def __init__( self.experiment_id = os.getenv("CODEFLASH_EXPERIMENT_ID", None) self.local_aiservice_client = LocalAiServiceClient() if self.experiment_id else None self.test_files = TestFiles(test_files=[]) + + self.effort = getattr(args, "effort", EffortLevel.MEDIUM.value) if args else EffortLevel.MEDIUM.value + self.args = args # Check defaults for these self.function_trace_id: str = str(uuid.uuid4()) self.original_module_path = module_name_from_file_path(self.function_to_optimize.file_path, self.project_root) @@ -382,7 +386,7 @@ def __init__( self.function_benchmark_timings = function_benchmark_timings if function_benchmark_timings else {} self.total_benchmark_timings = total_benchmark_timings if total_benchmark_timings else {} self.replay_tests_dir = replay_tests_dir if replay_tests_dir else None - n_tests = get_effort_value(EffortKeys.N_GENERATED_TESTS, args.effort) + n_tests = get_effort_value(EffortKeys.N_GENERATED_TESTS, self.effort) self.executor = concurrent.futures.ThreadPoolExecutor( max_workers=n_tests + 3 if self.experiment_id is None else n_tests + 4 ) @@ -434,7 +438,7 @@ def generate_and_instrument_tests( str, ]: """Generate and instrument tests for the function.""" - n_tests = get_effort_value(EffortKeys.N_GENERATED_TESTS, self.args.effort) + n_tests = get_effort_value(EffortKeys.N_GENERATED_TESTS, self.effort) generated_test_paths = [ get_test_file_path( self.test_cfg.tests_root, self.function_to_optimize.function_name, test_index, test_type="unit" @@ -1016,7 +1020,7 @@ def determine_best_candidate( dependency_code=code_context.read_only_context_code, trace_id=self.get_trace_id(exp_type), line_profiler_results=original_code_baseline.line_profile_results["str_out"], - n_candidates=get_effort_value(EffortKeys.N_OPTIMIZER_LP_CANDIDATES, self.args.effort), + 
n_candidates=get_effort_value(EffortKeys.N_OPTIMIZER_LP_CANDIDATES, self.effort), experiment_metadata=ExperimentMetadata( id=self.experiment_id, group="control" if exp_type == "EXP0" else "experiment" ) @@ -1031,7 +1035,7 @@ def determine_best_candidate( self.aiservice_client, self.executor, self.future_all_code_repair, - self.args.effort, + self.effort, self.future_adaptive_optimizations, ) candidate_index = 0 @@ -1096,7 +1100,7 @@ def call_adaptive_optimize( ai_service_client: AiServiceClient, ) -> concurrent.futures.Future[OptimizedCandidate | None] | None: if self.adaptive_optimization_counter >= get_effort_value( - EffortKeys.MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE, self.args.effort + EffortKeys.MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE, self.effort ): logger.debug( f"Max adaptive optimizations reached for {self.function_to_optimize.qualified_name}: {self.adaptive_optimization_counter}" @@ -1105,7 +1109,7 @@ def call_adaptive_optimize( adaptive_count = sum(1 for c in prev_candidates if c.source == OptimizedCandidateSource.ADAPTIVE) - if adaptive_count >= get_effort_value(EffortKeys.ADAPTIVE_OPTIMIZATION_THRESHOLD, self.args.effort): + if adaptive_count >= get_effort_value(EffortKeys.ADAPTIVE_OPTIMIZATION_THRESHOLD, self.effort): return None request_candidates = [] @@ -1425,7 +1429,7 @@ def generate_tests( generated_perf_test_paths: list[Path], ) -> Result[tuple[int, GeneratedTestsList, dict[str, set[FunctionCalledInTest]], str], str]: """Generate unit tests and concolic tests for the function.""" - n_tests = get_effort_value(EffortKeys.N_GENERATED_TESTS, self.args.effort) + n_tests = get_effort_value(EffortKeys.N_GENERATED_TESTS, self.effort) assert len(generated_test_paths) == n_tests if not self.args.no_gen_tests: @@ -1492,7 +1496,7 @@ def generate_optimizations( run_experiment: bool = False, # noqa: FBT001, FBT002 ) -> Result[tuple[OptimizationSet, str], str]: """Generate optimization candidates for the function. 
Backend handles multi-model diversity.""" - n_candidates = get_effort_value(EffortKeys.N_OPTIMIZER_CANDIDATES, self.args.effort) + n_candidates = get_effort_value(EffortKeys.N_OPTIMIZER_CANDIDATES, self.effort) future_optimization_candidates = self.executor.submit( self.aiservice_client.optimize_python_code, read_writable_code.markdown, @@ -2059,7 +2063,7 @@ def repair_if_possible( test_results_count: int, exp_type: str, ) -> None: - max_repairs = get_effort_value(EffortKeys.MAX_CODE_REPAIRS_PER_TRACE, self.args.effort) + max_repairs = get_effort_value(EffortKeys.MAX_CODE_REPAIRS_PER_TRACE, self.effort) if self.repair_counter >= max_repairs: logger.debug(f"Repair counter reached {max_repairs}, skipping repair") return @@ -2071,7 +2075,7 @@ def repair_if_possible( logger.debug("No diffs found, skipping repair") return result_unmatched_perc = len(diffs) / test_results_count - if result_unmatched_perc > get_effort_value(EffortKeys.REPAIR_UNMATCHED_PERCENTAGE_LIMIT, self.args.effort): + if result_unmatched_perc > get_effort_value(EffortKeys.REPAIR_UNMATCHED_PERCENTAGE_LIMIT, self.effort): logger.debug(f"Result unmatched percentage is {result_unmatched_perc * 100}%, skipping repair") return From 8afe34fc59a62d5321449ef2dfdbf31686341472 Mon Sep 17 00:00:00 2001 From: ali Date: Tue, 6 Jan 2026 04:07:04 +0200 Subject: [PATCH 07/12] fix enum python issue --- codeflash/code_utils/config_consts.py | 33 ++++++++++++++------------- 1 file changed, 17 insertions(+), 16 deletions(-) diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index 19cdc56c4..9257cf040 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -1,4 +1,5 @@ -from enum import StrEnum, auto +from enum import Enum +from typing import Any MAX_TEST_RUN_ITERATIONS = 5 INDIVIDUAL_TESTCASE_TIMEOUT = 15 @@ -37,24 +38,24 @@ MAX_CONTEXT_LEN_REVIEW = 1000 -class EffortLevel(StrEnum): - LOW = auto() - MEDIUM = auto() - HIGH = auto() +class EffortLevel(str, Enum): + LOW = "low" + MEDIUM = "medium" + HIGH = "high" -class EffortKeys(StrEnum): - N_OPTIMIZER_CANDIDATES = auto() - N_OPTIMIZER_LP_CANDIDATES = auto() - N_GENERATED_TESTS = auto() - MAX_CODE_REPAIRS_PER_TRACE = auto() - REPAIR_UNMATCHED_PERCENTAGE_LIMIT = auto() - TOP_VALID_CANDIDATES_FOR_REFINEMENT = auto() - ADAPTIVE_OPTIMIZATION_THRESHOLD = auto() - MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE = auto() +class EffortKeys(str, Enum): + N_OPTIMIZER_CANDIDATES = "N_OPTIMIZER_CANDIDATES" + N_OPTIMIZER_LP_CANDIDATES = "N_OPTIMIZER_LP_CANDIDATES" + N_GENERATED_TESTS = "N_GENERATED_TESTS" + MAX_CODE_REPAIRS_PER_TRACE = "MAX_CODE_REPAIRS_PER_TRACE" + REPAIR_UNMATCHED_PERCENTAGE_LIMIT = "REPAIR_UNMATCHED_PERCENTAGE_LIMIT" + TOP_VALID_CANDIDATES_FOR_REFINEMENT = "TOP_VALID_CANDIDATES_FOR_REFINEMENT" + ADAPTIVE_OPTIMIZATION_THRESHOLD = "ADAPTIVE_OPTIMIZATION_THRESHOLD" + MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE = "MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE" -EFFORT_VALUES: dict[str, dict[EffortLevel, any]] = { +EFFORT_VALUES: dict[str, dict[EffortLevel, Any]] = { EffortKeys.N_OPTIMIZER_CANDIDATES.value: {EffortLevel.LOW: 3, EffortLevel.MEDIUM: 5, EffortLevel.HIGH: 6}, EffortKeys.N_OPTIMIZER_LP_CANDIDATES.value: {EffortLevel.LOW: 4, EffortLevel.MEDIUM: 6, EffortLevel.HIGH: 7}, # we don't use effort with generated tests for now @@ -79,7 +80,7 @@ class EffortKeys(StrEnum): } -def get_effort_value(key: EffortKeys, effort: EffortLevel) -> any: +def get_effort_value(key: EffortKeys, effort: EffortLevel) -> Any: # noqa: ANN401 key_str = 
key.value if key_str in EFFORT_VALUES: if effort in EFFORT_VALUES[key_str]: From 54cf458f950f5fdc2b054d5235d3f672d5681a1d Mon Sep 17 00:00:00 2001 From: mohammed ahmed <64513301+mohammedahmed18@users.noreply.github.com> Date: Tue, 6 Jan 2026 23:42:34 +0000 Subject: [PATCH 08/12] merge main into optimization-effort --- codeflash/code_utils/config_consts.py | 3 + codeflash/context/code_context_extractor.py | 57 ++++-- .../context/unused_definition_remover.py | 12 ++ codeflash/models/models.py | 3 + codeflash/optimization/function_optimizer.py | 164 +++++++++-------- tests/test_code_context_extractor.py | 170 +++++++++++++++++- tests/test_instrument_line_profiler.py | 6 +- tests/test_remove_unused_definitions.py | 59 ++++++ 8 files changed, 382 insertions(+), 92 deletions(-) diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index 9257cf040..f3a881e1b 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -26,6 +26,9 @@ # LSP-specific TOTAL_LOOPING_TIME_LSP = 10.0 # Kept same timing for LSP mode to avoid in increase in performance reporting +# setting this value to 1 will disable repair if there is at least one correct candidate +MIN_CORRECT_CANDIDATES = 2 + try: from codeflash.lsp.helpers import is_LSP_enabled diff --git a/codeflash/context/code_context_extractor.py b/codeflash/context/code_context_extractor.py index 14d549633..a411bafac 100644 --- a/codeflash/context/code_context_extractor.py +++ b/codeflash/context/code_context_extractor.py @@ -446,31 +446,45 @@ def get_function_sources_from_jedi( definition_path = definition.module_path # The definition is part of this project and not defined within the original function - if ( + is_valid_definition = ( str(definition_path).startswith(str(project_root_path) + os.sep) and not path_belongs_to_site_packages(definition_path) and definition.full_name - and definition.type == "function" and not belongs_to_function_qualified(definition, qualified_function_name) and definition.full_name.startswith(definition.module_name) + ) + if is_valid_definition and definition.type == "function": + qualified_name = get_qualified_name(definition.module_name, definition.full_name) # Avoid nested functions or classes. Only class.function is allowed - and len( - (qualified_name := get_qualified_name(definition.module_name, definition.full_name)).split( - "." 
+ if len(qualified_name.split(".")) <= 2: + function_source = FunctionSource( + file_path=definition_path, + qualified_name=qualified_name, + fully_qualified_name=definition.full_name, + only_function_name=definition.name, + source_code=definition.get_line_code(), + jedi_definition=definition, ) + file_path_to_function_source[definition_path].add(function_source) + function_source_list.append(function_source) + # When a class is instantiated (e.g., MyClass()), track its __init__ as a helper + # This ensures the class definition with constructor is included in testgen context + elif is_valid_definition and definition.type == "class": + init_qualified_name = get_qualified_name( + definition.module_name, f"{definition.full_name}.__init__" ) - <= 2 - ): - function_source = FunctionSource( - file_path=definition_path, - qualified_name=qualified_name, - fully_qualified_name=definition.full_name, - only_function_name=definition.name, - source_code=definition.get_line_code(), - jedi_definition=definition, - ) - file_path_to_function_source[definition_path].add(function_source) - function_source_list.append(function_source) + # Only include if it's a top-level class (not nested) + if len(init_qualified_name.split(".")) <= 2: + function_source = FunctionSource( + file_path=definition_path, + qualified_name=init_qualified_name, + fully_qualified_name=f"{definition.full_name}.__init__", + only_function_name="__init__", + source_code=definition.get_line_code(), + jedi_definition=definition, + ) + file_path_to_function_source[definition_path].add(function_source) + function_source_list.append(function_source) return file_path_to_function_source, function_source_list @@ -647,7 +661,10 @@ def prune_cst_for_code_hashing( # noqa: PLR0911 if isinstance(node, cst.FunctionDef): qualified_name = f"{prefix}.{node.name.value}" if prefix else node.name.value - if qualified_name in target_functions: + # For hashing, exclude __init__ methods even if in target_functions + # because they don't affect the semantic behavior being hashed + # But include other dunder methods like __call__ which do affect behavior + if qualified_name in target_functions and node.name.value != "__init__": new_body = remove_docstring_from_body(node.body) if isinstance(node.body, cst.IndentedBlock) else node.body return node.with_changes(body=new_body), True return None, False @@ -666,7 +683,9 @@ def prune_cst_for_code_hashing( # noqa: PLR0911 for stmt in node.body.body: if isinstance(stmt, cst.FunctionDef): qualified_name = f"{class_prefix}.{stmt.name.value}" - if qualified_name in target_functions: + # For hashing, exclude __init__ methods even if in target_functions + # but include other methods like __call__ which affect behavior + if qualified_name in target_functions and stmt.name.value != "__init__": stmt_with_changes = stmt.with_changes( body=remove_docstring_from_body(cast("cst.IndentedBlock", stmt.body)) ) diff --git a/codeflash/context/unused_definition_remover.py b/codeflash/context/unused_definition_remover.py index 8e6ea057c..823cb735b 100644 --- a/codeflash/context/unused_definition_remover.py +++ b/codeflash/context/unused_definition_remover.py @@ -223,6 +223,18 @@ def visit_ClassDef(self, node: cst.ClassDef) -> None: self.current_class = class_name self.current_top_level_name = class_name + # Track base classes as dependencies + for base in node.bases: + if isinstance(base.value, cst.Name): + base_name = base.value.value + if base_name in self.definitions and class_name in self.definitions: + 
self.definitions[class_name].dependencies.add(base_name) + elif isinstance(base.value, cst.Attribute): + # Handle cases like module.ClassName + attr_name = base.value.attr.value + if attr_name in self.definitions and class_name in self.definitions: + self.definitions[class_name].dependencies.add(attr_name) + self.class_depth += 1 def leave_ClassDef(self, original_node: cst.ClassDef) -> None: # noqa: ARG002 diff --git a/codeflash/models/models.py b/codeflash/models/models.py index 1af946da4..844ff9603 100644 --- a/codeflash/models/models.py +++ b/codeflash/models/models.py @@ -463,6 +463,9 @@ def register_new_candidate( def get_speedup_ratio(self, optimization_id: str) -> float | None: return self.speedup_ratios.get(optimization_id) + def get_optimized_runtime(self, optimization_id: str) -> float | None: + return self.optimized_runtimes.get(optimization_id) + @dataclass(frozen=True) class TestsInFile: diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index a253acefb..0b525e3e7 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -45,6 +45,7 @@ from codeflash.code_utils.config_consts import ( COVERAGE_THRESHOLD, INDIVIDUAL_TESTCASE_TIMEOUT, + MIN_CORRECT_CANDIDATES, REFINED_CANDIDATE_RANKING_WEIGHTS, REPEAT_OPTIMIZATION_PROBABILITY, TOTAL_LOOPING_TIME_EFFECTIVE, @@ -181,11 +182,11 @@ def __init__( self, initial_candidates: list[OptimizedCandidate], future_line_profile_results: concurrent.futures.Future, - all_refinements_data: list[AIServiceRefinerRequest], ai_service_client: AiServiceClient, - executor: concurrent.futures.ThreadPoolExecutor, + eval_ctx: CandidateEvaluationContext, + original_markdown_code: str, + future_all_refinements: list[concurrent.futures.Future], future_all_code_repair: list[concurrent.futures.Future], - effort: str, future_adaptive_optimizations: list[concurrent.futures.Future], ) -> None: self.candidate_queue = queue.Queue() @@ -194,9 +195,9 @@ def __init__( self.refinement_done = False self.candidate_len = len(initial_candidates) self.ai_service_client = ai_service_client - self.executor = executor - self.effort = effort self.refinement_calls_count = 0 + self.original_markdown_code = original_markdown_code + self.eval_ctx = eval_ctx # Initialize queue with initial candidates for candidate in initial_candidates: @@ -204,7 +205,7 @@ def __init__( self.candidate_queue.put(candidate) self.future_line_profile_results = future_line_profile_results - self.all_refinements_data = all_refinements_data + self.future_all_refinements = future_all_refinements self.future_all_code_repair = future_all_code_repair self.future_adaptive_optimizations = future_adaptive_optimizations @@ -235,7 +236,13 @@ def _handle_empty_queue(self) -> CandidateNode | None: lambda: self.future_all_code_repair.clear(), ) if self.line_profiler_done and not self.refinement_done: - return self._process_refinement_results() + return self._process_candidates( + self.future_all_refinements, + "Refining generated code for improved quality and performance...", + "Added {0} candidates from refinement, total candidates now: {1}", + lambda: setattr(self, "refinement_done", True), + filter_candidates_func=self._filter_refined_candidates, + ) if len(self.future_adaptive_optimizations) > 0: return self._process_candidates( self.future_adaptive_optimizations, @@ -251,6 +258,7 @@ def _process_candidates( loading_msg: str, success_msg: str, callback: Callable[[], None], + filter_candidates_func: 
Callable[[list[OptimizedCandidate]], list[OptimizedCandidate]] | None = None,
     ) -> CandidateNode | None:
         if len(future_candidates) == 0:
             return None
@@ -269,6 +277,7 @@
         else:
             candidates.append(candidate_result)
 
+        candidates = filter_candidates_func(candidates) if filter_candidates_func else candidates
         for candidate in candidates:
             self.forest.add(candidate)
             self.candidate_queue.put(candidate)
@@ -280,55 +289,50 @@
             callback()
         return self.get_next_candidate()
 
-    def refine_optimizations(self, request: list[AIServiceRefinerRequest]) -> concurrent.futures.Future:
-        return self.executor.submit(self.ai_service_client.optimize_python_code_refinement, request=request)
+    def _filter_refined_candidates(self, candidates: list[OptimizedCandidate]) -> list[OptimizedCandidate]:
+        """We generate a weighted ranking based on the runtime and diff lines and select the best of the valid optimizations to be tested."""
+        self.refinement_calls_count += len(candidates)
 
-    def _process_refinement_results(self) -> CandidateNode | None:
-        """Process refinement results and add to queue. We generate a weighted ranking based on the runtime and diff lines and select the best (round of 45%) of valid optimizations to be refined."""
-        future_refinements: list[concurrent.futures.Future] = []
         top_n_candidates = int(
             min(
                 int(get_effort_value(EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT, self.effort)),
                 len(self.all_refinements_data),
             )
         )
-        refinement_call_index = 0
 
-        if top_n_candidates == len(self.all_refinements_data):
-            # if we'll refine all candidates, we can skip the ranking and just refine them all
-            for data in self.all_refinements_data:
-                refinement_call_index += 1
-                future_refinements.append(self.refine_optimizations([data]))
-        else:
-            diff_lens_list = []
-            runtimes_list = []
-            for c in self.all_refinements_data:
-                diff_lens_list.append(diff_length(c.original_source_code, c.optimized_source_code))
-                runtimes_list.append(c.optimized_code_runtime)
+        if len(candidates) == top_n_candidates:
+            # no need for ranking since we will return all candidates
+            return candidates
+
+        diff_lens_list = []
+        runtimes_list = []
+        for c in candidates:
+            # the current refined candidate has not been benchmarked yet; the parent candidate's runtime is the closest estimate we have
+            parent_id = c.parent_id
+            parent_candidate_node = self.forest.get_node(parent_id)
+            parent_optimized_runtime = self.eval_ctx.get_optimized_runtime(parent_id)
+            if not parent_optimized_runtime or not parent_candidate_node:
+                continue
+            diff_lens_list.append(
+                diff_length(self.original_markdown_code, 
parent_candidate_node.candidate.source_code.markdown)
+            )
+            runtimes_list.append(parent_optimized_runtime)
+
+        if not runtimes_list or not diff_lens_list:
+            # should not happen
+            logger.warning("No valid candidates for refinement while filtering")
+            return candidates
+
+        runtime_w, diff_w = REFINED_CANDIDATE_RANKING_WEIGHTS
+        weights = choose_weights(runtime=runtime_w, diff=diff_w)
+
+        runtime_norm = normalize_by_max(runtimes_list)
+        diffs_norm = normalize_by_max(diff_lens_list)
+        # lower scores are better
+        score_dict = create_score_dictionary_from_metrics(weights, runtime_norm, diffs_norm)
+        top_indices = sorted(score_dict, key=score_dict.get)[:top_n_candidates]
+
+        return [candidates[idx] for idx in top_indices]
 
     def is_done(self) -> bool:
         """Check if processing is complete."""
@@ -392,6 +396,7 @@ def __init__(
         )
         self.optimization_review = ""
         self.future_all_code_repair: list[concurrent.futures.Future] = []
+        self.future_all_refinements: list[concurrent.futures.Future] = []
         self.future_adaptive_optimizations: list[concurrent.futures.Future] = []
         self.repair_counter = 0  # track how many repairs we did for each function
         self.adaptive_optimization_counter = 0  # track how many adaptive optimizations we did for each function
@@ -838,7 +843,6 @@ def process_single_candidate(
         original_helper_code: dict[Path, str],
         file_path_to_helper_classes: dict[Path, set[str]],
         eval_ctx: CandidateEvaluationContext,
-        all_refinements_data: list[AIServiceRefinerRequest],
         exp_type: str,
         function_references: str,
     ) -> BestOptimization | None:
@@ -894,6 +898,7 @@ def process_single_candidate(
             baseline_results=original_code_baseline,
             original_helper_code=original_helper_code,
             file_path_to_helper_classes=file_path_to_helper_classes,
+            eval_ctx=eval_ctx,
             code_context=code_context,
             candidate=candidate,
             exp_type=exp_type,
@@ -947,33 +952,40 @@ def process_single_candidate(
             c.source == OptimizedCandidateSource.REFINE for c in current_tree_candidates
         )
 
+        aiservice_client = self.aiservice_client if exp_type == "EXP0" else self.local_aiservice_client
+
         if is_candidate_refined_before:
             future_adaptive_optimization = self.call_adaptive_optimize(
                 trace_id=self.get_trace_id(exp_type),
                 original_source_code=code_context.read_writable_code.markdown,
                 prev_candidates=current_tree_candidates,
                 eval_ctx=eval_ctx,
-                ai_service_client=self.aiservice_client if exp_type == "EXP0" else self.local_aiservice_client,
+                ai_service_client=aiservice_client,
             )
             if future_adaptive_optimization:
                 self.future_adaptive_optimizations.append(future_adaptive_optimization)
         else:
-            all_refinements_data.append(
-                AIServiceRefinerRequest(
-                    optimization_id=best_optimization.candidate.optimization_id,
-                    original_source_code=code_context.read_writable_code.markdown,
-                    read_only_dependency_code=code_context.read_only_context_code,
-                    original_code_runtime=original_code_baseline.runtime,
-                    optimized_source_code=best_optimization.candidate.source_code.markdown,
-                    optimized_explanation=best_optimization.candidate.explanation,
-                    optimized_code_runtime=best_optimization.runtime,
-                    speedup=f"{int(performance_gain(original_runtime_ns=original_code_baseline.runtime, optimized_runtime_ns=best_optimization.runtime) * 100)}%",
-                    trace_id=self.get_trace_id(exp_type),
-                    original_line_profiler_results=original_code_baseline.line_profile_results["str_out"],
-                    optimized_line_profiler_results=best_optimization.line_profiler_test_results["str_out"],
-                    function_references=function_references,
-                )
+            future_refinement = self.executor.submit(
aiservice_client.optimize_python_code_refinement, + request=[ + AIServiceRefinerRequest( + optimization_id=best_optimization.candidate.optimization_id, + original_source_code=code_context.read_writable_code.markdown, + read_only_dependency_code=code_context.read_only_context_code, + original_code_runtime=original_code_baseline.runtime, + optimized_source_code=best_optimization.candidate.source_code.markdown, + optimized_explanation=best_optimization.candidate.explanation, + optimized_code_runtime=best_optimization.runtime, + speedup=f"{int(performance_gain(original_runtime_ns=original_code_baseline.runtime, optimized_runtime_ns=best_optimization.runtime) * 100)}%", + trace_id=self.get_trace_id(exp_type), + original_line_profiler_results=original_code_baseline.line_profile_results["str_out"], + optimized_line_profiler_results=best_optimization.line_profiler_test_results["str_out"], + function_references=function_references, + ) + ], ) + self.future_all_refinements.append(future_refinement) + # Display runtime information if is_LSP_enabled(): lsp_log(LspMarkdownMessage(markdown=tree_to_markdown(tree))) @@ -1005,9 +1017,11 @@ def determine_best_candidate( # Initialize evaluation context and async tasks eval_ctx = CandidateEvaluationContext() - all_refinements_data: list[AIServiceRefinerRequest] = [] + + self.future_all_refinements.clear() self.future_all_code_repair.clear() self.future_adaptive_optimizations.clear() + self.repair_counter = 0 self.adaptive_optimization_counter = 0 @@ -1031,9 +1045,10 @@ def determine_best_candidate( processor = CandidateProcessor( candidates, future_line_profile_results, - all_refinements_data, self.aiservice_client, - self.executor, + eval_ctx, + code_context.read_writable_code.markdown, + self.future_all_refinements, self.future_all_code_repair, self.effort, self.future_adaptive_optimizations, @@ -1058,7 +1073,6 @@ def determine_best_candidate( original_helper_code=original_helper_code, file_path_to_helper_classes=file_path_to_helper_classes, eval_ctx=eval_ctx, - all_refinements_data=all_refinements_data, exp_type=exp_type, function_references=function_references, ) @@ -2059,6 +2073,7 @@ def repair_if_possible( self, candidate: OptimizedCandidate, diffs: list[TestDiff], + eval_ctx: CandidateEvaluationContext, code_context: CodeOptimizationContext, test_results_count: int, exp_type: str, @@ -2067,6 +2082,12 @@ def repair_if_possible( if self.repair_counter >= max_repairs: logger.debug(f"Repair counter reached {max_repairs}, skipping repair") return + + successful_candidates_count = sum(1 for is_correct in eval_ctx.is_correct.values() if is_correct) + if successful_candidates_count >= MIN_CORRECT_CANDIDATES: + logger.debug(f"{successful_candidates_count} of the candidates were correct, no need to repair") + return + if candidate.source not in (OptimizedCandidateSource.OPTIMIZE, OptimizedCandidateSource.OPTIMIZE_LP): # only repair the first pass of the candidates for now logger.debug(f"Candidate is a result of {candidate.source.value}, skipping repair") @@ -2104,6 +2125,7 @@ def run_optimized_candidate( baseline_results: OriginalCodeBaseline, original_helper_code: dict[Path, str], file_path_to_helper_classes: dict[Path, set[str]], + eval_ctx: CandidateEvaluationContext, code_context: CodeOptimizationContext, candidate: OptimizedCandidate, exp_type: str, @@ -2159,7 +2181,9 @@ def run_optimized_candidate( logger.info("h3|Test results matched ✅") console.rule() else: - self.repair_if_possible(candidate, diffs, code_context, len(candidate_behavior_results), 
exp_type) + self.repair_if_possible( + candidate, diffs, eval_ctx, code_context, len(candidate_behavior_results), exp_type + ) return self.get_results_not_matched_error() logger.info(f"loading|Running performance tests for candidate {optimization_candidate_index}...") diff --git a/tests/test_code_context_extractor.py b/tests/test_code_context_extractor.py index aa4e2880f..b7cce0869 100644 --- a/tests/test_code_context_extractor.py +++ b/tests/test_code_context_extractor.py @@ -84,7 +84,8 @@ def test_code_replacement10() -> None: code_ctx = get_code_optimization_context(function_to_optimize=func_top_optimize, project_root_path=file_path.parent) qualified_names = {func.qualified_name for func in code_ctx.helper_functions} - assert qualified_names == {"HelperClass.helper_method"} # Nested method should not be in here + # HelperClass.__init__ is now tracked because HelperClass(self.name) instantiates the class + assert qualified_names == {"HelperClass.helper_method", "HelperClass.__init__"} # Nested method should not be in here read_write_context, read_only_context = code_ctx.read_writable_code, code_ctx.read_only_context_code hashing_context = code_ctx.hashing_code_context @@ -570,6 +571,8 @@ def __call__(self, *args: _P.args, **kwargs: _P.kwargs) -> _R: class AbstractCacheBackend(CacheBackend, Protocol[_KEY_T, _STORE_T]): """Interface for cache backends used by the persistent cache decorator.""" + def __init__(self) -> None: ... + def hash_key( self, *, @@ -1296,6 +1299,8 @@ def __repr__(self) -> str: ``` ```python:{path_to_transform_utils.relative_to(project_root)} class DataTransformer: + def __init__(self): + self.data = None def transform(self, data): self.data = data @@ -1599,7 +1604,11 @@ def __repr__(self) -> str: \"\"\"Return a string representation of the DataProcessor.\"\"\" return f"DataProcessor(default_prefix={{self.default_prefix!r}})" ``` - +```python:{path_to_transform_utils.relative_to(project_root)} +class DataTransformer: + def __init__(self): + self.data = None +``` """ expected_hashing_context = f""" ```python:utils.py @@ -1705,6 +1714,7 @@ def test_direct_module_import() -> None: expected_read_only_context = """ ```python:utils.py +import math from transform_utils import DataTransformer class DataProcessor: @@ -1712,6 +1722,11 @@ class DataProcessor: number = 1 + def __init__(self, default_prefix: str = "PREFIX_"): + \"\"\"Initialize the DataProcessor with a default prefix.\"\"\" + self.default_prefix = default_prefix + self.number += math.log(self.number) + def __repr__(self) -> str: \"\"\"Return a string representation of the DataProcessor.\"\"\" return f"DataProcessor(default_prefix={self.default_prefix!r})" @@ -2727,3 +2742,154 @@ async def async_function(): # Verify correct order expected_order = ["GLOBAL_CONSTANT", "ANOTHER_CONSTANT", "FINAL_ASSIGNMENT"] assert collector.assignment_order == expected_order + + +def test_class_instantiation_includes_init_as_helper(tmp_path: Path) -> None: + """Test that when a class is instantiated, its __init__ method is tracked as a helper. + + This test verifies the fix for the bug where class constructors were not + included in the context when only the class instantiation was called + (not any other methods). This caused LLMs to not know the constructor + signatures when generating tests. 
+ """ + code = ''' +class DataDumper: + """A class that dumps data.""" + + def __init__(self, data): + """Initialize with data.""" + self.data = data + + def dump(self): + """Dump the data.""" + return self.data + + +def target_function(): + # Only instantiates DataDumper, doesn't call any other methods + dumper = DataDumper({"key": "value"}) + return dumper +''' + file_path = tmp_path / "test_code.py" + file_path.write_text(code, encoding="utf-8") + opt = Optimizer( + Namespace( + project_root=file_path.parent.resolve(), + disable_telemetry=True, + tests_root="tests", + test_framework="pytest", + pytest_cmd="pytest", + experiment_id=None, + test_project_root=Path().resolve(), + ) + ) + function_to_optimize = FunctionToOptimize( + function_name="target_function", + file_path=file_path, + parents=[], + starting_line=None, + ending_line=None, + ) + + code_ctx = get_code_optimization_context(function_to_optimize, opt.args.project_root) + + # The __init__ method should be tracked as a helper since DataDumper() instantiates the class + qualified_names = {func.qualified_name for func in code_ctx.helper_functions} + assert "DataDumper.__init__" in qualified_names, ( + "DataDumper.__init__ should be tracked as a helper when the class is instantiated" + ) + + # The testgen context should contain the class with __init__ (critical for LLM to know constructor) + testgen_context = code_ctx.testgen_context.markdown + assert "class DataDumper:" in testgen_context, "DataDumper class should be in testgen context" + assert "def __init__(self, data):" in testgen_context, ( + "__init__ method should be included in testgen context" + ) + + # The hashing context should NOT contain __init__ (excluded for stability) + hashing_context = code_ctx.hashing_code_context + assert "__init__" not in hashing_context, ( + "__init__ should NOT be in hashing context (excluded for hash stability)" + ) + + +def test_class_instantiation_preserves_full_class_in_testgen(tmp_path: Path) -> None: + """Test that instantiated classes are fully preserved in testgen context. + + This is specifically for the unstructured LayoutDumper bug where helper classes + that were instantiated but had no other methods called were being excluded + from the testgen context. 
+ """ + code = ''' +class LayoutDumper: + """Base class for layout dumpers.""" + layout_source: str = "unknown" + + def __init__(self, layout): + self._layout = layout + + def dump(self) -> dict: + raise NotImplementedError() + + +class ObjectDetectionLayoutDumper(LayoutDumper): + """Specific dumper for object detection layouts.""" + + def __init__(self, layout): + super().__init__(layout) + + def dump(self) -> dict: + return {"type": "object_detection", "layout": self._layout} + + +def dump_layout(layout_type, layout): + """Dump a layout based on its type.""" + if layout_type == "object_detection": + dumper = ObjectDetectionLayoutDumper(layout) + else: + dumper = LayoutDumper(layout) + return dumper.dump() +''' + file_path = tmp_path / "test_code.py" + file_path.write_text(code, encoding="utf-8") + opt = Optimizer( + Namespace( + project_root=file_path.parent.resolve(), + disable_telemetry=True, + tests_root="tests", + test_framework="pytest", + pytest_cmd="pytest", + experiment_id=None, + test_project_root=Path().resolve(), + ) + ) + function_to_optimize = FunctionToOptimize( + function_name="dump_layout", + file_path=file_path, + parents=[], + starting_line=None, + ending_line=None, + ) + + code_ctx = get_code_optimization_context(function_to_optimize, opt.args.project_root) + qualified_names = {func.qualified_name for func in code_ctx.helper_functions} + + # Both class __init__ methods should be tracked as helpers + assert "ObjectDetectionLayoutDumper.__init__" in qualified_names, ( + "ObjectDetectionLayoutDumper.__init__ should be tracked" + ) + assert "LayoutDumper.__init__" in qualified_names, ( + "LayoutDumper.__init__ should be tracked" + ) + + # The testgen context should include both classes with their __init__ methods + testgen_context = code_ctx.testgen_context.markdown + assert "class LayoutDumper:" in testgen_context, "LayoutDumper should be in testgen context" + assert "class ObjectDetectionLayoutDumper" in testgen_context, ( + "ObjectDetectionLayoutDumper should be in testgen context" + ) + + # Both __init__ methods should be in the testgen context (so LLM knows constructor signatures) + assert testgen_context.count("def __init__") >= 2, ( + "Both __init__ methods should be in testgen context" + ) diff --git a/tests/test_instrument_line_profiler.py b/tests/test_instrument_line_profiler.py index 71d1005c0..675db5944 100644 --- a/tests/test_instrument_line_profiler.py +++ b/tests/test_instrument_line_profiler.py @@ -55,6 +55,7 @@ def hi(): class BubbleSortClass: + @codeflash_line_profile def __init__(self): pass @@ -117,7 +118,9 @@ def sort_classmethod(x): return y.sorter(x) """ assert code_path.read_text("utf-8") == expected_code_main - assert code_context.helper_functions.__len__() == 0 + # WrapperClass.__init__ is now detected as a helper since WrapperClass.BubbleSortClass() instantiates it + assert len(code_context.helper_functions) == 1 + assert code_context.helper_functions[0].qualified_name == "WrapperClass.__init__" finally: func_optimizer.write_code_and_helpers( func_optimizer.function_to_optimize_source_code, original_helper_code, func_optimizer.function_to_optimize.file_path @@ -283,6 +286,7 @@ def sorter(arr): ans = helper(arr) return ans class helper: + @codeflash_line_profile def __init__(self, arr): return arr.sort() """ diff --git a/tests/test_remove_unused_definitions.py b/tests/test_remove_unused_definitions.py index 86a57bb6d..8d09a95e1 100644 --- a/tests/test_remove_unused_definitions.py +++ b/tests/test_remove_unused_definitions.py @@ -337,6 +337,65 @@ 
def unused_function(): result = remove_unused_definitions_by_function_names(code, qualified_functions) assert result.strip() == expected.strip() +def test_base_class_inheritance() -> None: + """Test that base classes used only for inheritance are preserved.""" + code = """ +class LayoutDumper: + def dump(self): + raise NotImplementedError + +class ObjectDetectionLayoutDumper(LayoutDumper): + def __init__(self, data): + self.data = data + def dump(self): + return self.data + +class ExtractedLayoutDumper(LayoutDumper): + def __init__(self, data): + self.data = data + def dump(self): + return self.data + +class UnusedClass: + pass + +def test_function(): + dumper = ObjectDetectionLayoutDumper({}) + return dumper.dump() +""" + + expected = """ +class LayoutDumper: + def dump(self): + raise NotImplementedError + +class ObjectDetectionLayoutDumper(LayoutDumper): + def __init__(self, data): + self.data = data + def dump(self): + return self.data + +class ExtractedLayoutDumper(LayoutDumper): + def __init__(self, data): + self.data = data + def dump(self): + return self.data + +class UnusedClass: + pass + +def test_function(): + dumper = ObjectDetectionLayoutDumper({}) + return dumper.dump() +""" + + qualified_functions = {"test_function"} + result = remove_unused_definitions_by_function_names(code, qualified_functions) + # LayoutDumper should be preserved because ObjectDetectionLayoutDumper inherits from it + assert "class LayoutDumper" in result + assert "class ObjectDetectionLayoutDumper" in result + + def test_conditional_and_loop_variables() -> None: """Test handling of variables defined in if-else and while loops.""" code = """ From c18af789c945421b101644ec506bbd673d6d5cfa Mon Sep 17 00:00:00 2001 From: mohammed ahmed <64513301+mohammedahmed18@users.noreply.github.com> Date: Wed, 7 Jan 2026 00:24:49 +0000 Subject: [PATCH 09/12] fixes --- codeflash/code_utils/config_consts.py | 7 +++++-- codeflash/optimization/function_optimizer.py | 11 +++++------ 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index f3a881e1b..97be8808c 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -1,5 +1,7 @@ +from __future__ import annotations + from enum import Enum -from typing import Any +from typing import Any, Union MAX_TEST_RUN_ITERATIONS = 5 INDIVIDUAL_TESTCASE_TIMEOUT = 15 @@ -83,8 +85,9 @@ class EffortKeys(str, Enum): } -def get_effort_value(key: EffortKeys, effort: EffortLevel) -> Any: # noqa: ANN401 +def get_effort_value(key: EffortKeys, effort: Union[EffortLevel,str]) -> Any: # noqa: ANN401 key_str = key.value + effort = effort.value if isinstance(effort, EffortLevel) else effort if key_str in EFFORT_VALUES: if effort in EFFORT_VALUES[key_str]: return EFFORT_VALUES[key_str][effort] diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 0b525e3e7..05f6965c9 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -182,8 +182,8 @@ def __init__( self, initial_candidates: list[OptimizedCandidate], future_line_profile_results: concurrent.futures.Future, - ai_service_client: AiServiceClient, eval_ctx: CandidateEvaluationContext, + effort: str, original_markdown_code: str, future_all_refinements: list[concurrent.futures.Future], future_all_code_repair: list[concurrent.futures.Future], @@ -193,11 +193,11 @@ def __init__( self.forest = CandidateForest() 
self.line_profiler_done = False self.refinement_done = False + self.eval_ctx = eval_ctx + self.effort = effort self.candidate_len = len(initial_candidates) - self.ai_service_client = ai_service_client self.refinement_calls_count = 0 self.original_markdown_code = original_markdown_code - self.eval_ctx = eval_ctx # Initialize queue with initial candidates for candidate in initial_candidates: @@ -296,7 +296,7 @@ def _filter_refined_candidates(self, candidates: list[OptimizedCandidate]) -> li top_n_candidates = int( min( int(get_effort_value(EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT, self.effort)), - len(self.all_refinements_data), + len(candidates), ) ) @@ -1045,12 +1045,11 @@ def determine_best_candidate( processor = CandidateProcessor( candidates, future_line_profile_results, - self.aiservice_client, eval_ctx, + self.effort, code_context.read_writable_code.markdown, self.future_all_refinements, self.future_all_code_repair, - self.effort, self.future_adaptive_optimizations, ) candidate_index = 0 From 2a86446fdcc61c96171574595d1b84bef39bfc1e Mon Sep 17 00:00:00 2001 From: mohammed ahmed <64513301+mohammedahmed18@users.noreply.github.com> Date: Wed, 7 Jan 2026 00:41:39 +0000 Subject: [PATCH 10/12] formatting and linting --- codeflash/code_utils/config_consts.py | 21 ++++++++++++-------- codeflash/optimization/function_optimizer.py | 5 +---- 2 files changed, 14 insertions(+), 12 deletions(-) diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py index 97be8808c..1ee373579 100644 --- a/codeflash/code_utils/config_consts.py +++ b/codeflash/code_utils/config_consts.py @@ -85,13 +85,18 @@ class EffortKeys(str, Enum): } -def get_effort_value(key: EffortKeys, effort: Union[EffortLevel,str]) -> Any: # noqa: ANN401 +def get_effort_value(key: EffortKeys, effort: Union[EffortLevel, str]) -> Any: # noqa: ANN401 key_str = key.value - effort = effort.value if isinstance(effort, EffortLevel) else effort - if key_str in EFFORT_VALUES: - if effort in EFFORT_VALUES[key_str]: - return EFFORT_VALUES[key_str][effort] - msg = f"Invalid effort level: {effort}" + + if isinstance(effort, str): + try: + effort = EffortLevel(effort) + except ValueError: + msg = f"Invalid effort level: {effort}" + raise ValueError(msg) from None + + if key_str not in EFFORT_VALUES: + msg = f"Invalid key: {key_str}" raise ValueError(msg) - msg = f"Invalid key: {key_str}" - raise ValueError(msg) + + return EFFORT_VALUES[key_str][effort] diff --git a/codeflash/optimization/function_optimizer.py b/codeflash/optimization/function_optimizer.py index 05f6965c9..79878d991 100644 --- a/codeflash/optimization/function_optimizer.py +++ b/codeflash/optimization/function_optimizer.py @@ -294,10 +294,7 @@ def _filter_refined_candidates(self, candidates: list[OptimizedCandidate]) -> li self.refinement_calls_count += len(candidates) top_n_candidates = int( - min( - int(get_effort_value(EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT, self.effort)), - len(candidates), - ) + min(int(get_effort_value(EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT, self.effort)), len(candidates)) ) if len(candidates) == top_n_candidates: From ca9769c7689f29e0a9ec4414684a390820a8058e Mon Sep 17 00:00:00 2001 From: mohammed ahmed <64513301+mohammedahmed18@users.noreply.github.com> Date: Wed, 7 Jan 2026 00:53:08 +0000 Subject: [PATCH 11/12] modify effort values --- codeflash/code_utils/config_consts.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/codeflash/code_utils/config_consts.py 
b/codeflash/code_utils/config_consts.py
index 1ee373579..ca79ebbd9 100644
--- a/codeflash/code_utils/config_consts.py
+++ b/codeflash/code_utils/config_consts.py
@@ -65,22 +65,24 @@ class EffortKeys(str, Enum):
     EffortKeys.N_OPTIMIZER_LP_CANDIDATES.value: {EffortLevel.LOW: 4, EffortLevel.MEDIUM: 6, EffortLevel.HIGH: 7},
     # we don't use effort with generated tests for now
     EffortKeys.N_GENERATED_TESTS.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 2, EffortLevel.HIGH: 2},
-    # maximum number of repairs we will do for each function
-    EffortKeys.MAX_CODE_REPAIRS_PER_TRACE.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 4, EffortLevel.HIGH: 5},
+    # maximum number of repairs we will do for each function (in case the number of valid candidates is less than MIN_CORRECT_CANDIDATES)
+    EffortKeys.MAX_CODE_REPAIRS_PER_TRACE.value: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 3, EffortLevel.HIGH: 5},
     # if the percentage of unmatched tests is greater than this, we won't fix it (lowering this value makes the repair more strict)
     # on low effort we lower the limit to 20% to be more strict (fewer repairs, less time)
     EffortKeys.REPAIR_UNMATCHED_PERCENTAGE_LIMIT.value: {
         EffortLevel.LOW: 0.2,
-        EffortLevel.MEDIUM: 0.4,
-        EffortLevel.HIGH: 0.5,
+        EffortLevel.MEDIUM: 0.3,
+        EffortLevel.HIGH: 0.4,
     },
     # Top valid candidates for refinements
     EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 3, EffortLevel.HIGH: 4},
-    EffortKeys.ADAPTIVE_OPTIMIZATION_THRESHOLD.value: {EffortLevel.LOW: 0, EffortLevel.MEDIUM: 1, EffortLevel.HIGH: 3},
+    # max number of adaptive optimization calls to make per a single candidate tree
+    EffortKeys.ADAPTIVE_OPTIMIZATION_THRESHOLD.value: {EffortLevel.LOW: 0, EffortLevel.MEDIUM: 1, EffortLevel.HIGH: 2},
+    # max number of adaptive optimization calls to make per a single trace
     EffortKeys.MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE.value: {
         EffortLevel.LOW: 0,
-        EffortLevel.MEDIUM: 3,
-        EffortLevel.HIGH: 10,
+        EffortLevel.MEDIUM: 2,
+        EffortLevel.HIGH: 4,
     },
 }

From 0a33bc10ef93c511d6fa1bb2ff24380f11a0d45e Mon Sep 17 00:00:00 2001
From: ali
Date: Wed, 7 Jan 2026 18:36:54 +0200
Subject: [PATCH 12/12] disable adaptive optimization for medium effort

---
 codeflash/code_utils/config_consts.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/codeflash/code_utils/config_consts.py b/codeflash/code_utils/config_consts.py
index ca79ebbd9..96d6b8e14 100644
--- a/codeflash/code_utils/config_consts.py
+++ b/codeflash/code_utils/config_consts.py
@@ -77,11 +77,11 @@ class EffortKeys(str, Enum):
     # Top valid candidates for refinements
     EffortKeys.TOP_VALID_CANDIDATES_FOR_REFINEMENT: {EffortLevel.LOW: 2, EffortLevel.MEDIUM: 3, EffortLevel.HIGH: 4},
     # max number of adaptive optimization calls to make per a single candidate tree
-    EffortKeys.ADAPTIVE_OPTIMIZATION_THRESHOLD.value: {EffortLevel.LOW: 0, EffortLevel.MEDIUM: 1, EffortLevel.HIGH: 2},
+    EffortKeys.ADAPTIVE_OPTIMIZATION_THRESHOLD.value: {EffortLevel.LOW: 0, EffortLevel.MEDIUM: 0, EffortLevel.HIGH: 2},
     # max number of adaptive optimization calls to make per a single trace
     EffortKeys.MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE.value: {
         EffortLevel.LOW: 0,
-        EffortLevel.MEDIUM: 2,
+        EffortLevel.MEDIUM: 0,
         EffortLevel.HIGH: 4,
     },
 }
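After PATCH 12/12, the effort table resolves as in the minimal, self-contained sketch below. The EffortLevel/EffortKeys member names, the get_effort_value logic, and the final numbers are taken from the diffs above; the string values assigned to the two EffortKeys members are illustrative assumptions, since the series never shows them, and the table is trimmed to the two keys PATCH 12/12 touches.

```python
# Sketch of the final effort lookup (PATCH 10/12 logic, PATCH 12/12 values).
from __future__ import annotations

from enum import Enum
from typing import Any, Union


class EffortLevel(str, Enum):
    LOW = "low"
    MEDIUM = "medium"
    HIGH = "high"


class EffortKeys(str, Enum):
    # Member names appear in the diffs; the string values here are assumed for illustration.
    ADAPTIVE_OPTIMIZATION_THRESHOLD = "adaptive_optimization_threshold"
    MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE = "max_adaptive_optimizations_per_trace"


# Trimmed to the two keys changed by PATCH 12/12; numbers match the final diff.
EFFORT_VALUES: dict[str, dict[EffortLevel, Any]] = {
    EffortKeys.ADAPTIVE_OPTIMIZATION_THRESHOLD.value: {EffortLevel.LOW: 0, EffortLevel.MEDIUM: 0, EffortLevel.HIGH: 2},
    EffortKeys.MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE.value: {
        EffortLevel.LOW: 0,
        EffortLevel.MEDIUM: 0,
        EffortLevel.HIGH: 4,
    },
}


def get_effort_value(key: EffortKeys, effort: Union[EffortLevel, str]) -> Any:
    # Accept either the enum member or its string form ("low"/"medium"/"high"),
    # normalizing strings to EffortLevel as PATCH 10/12 does.
    if isinstance(effort, str):
        try:
            effort = EffortLevel(effort)
        except ValueError:
            msg = f"Invalid effort level: {effort}"
            raise ValueError(msg) from None
    key_str = key.value
    if key_str not in EFFORT_VALUES:
        msg = f"Invalid key: {key_str}"
        raise ValueError(msg)
    return EFFORT_VALUES[key_str][effort]


if __name__ == "__main__":
    # Medium effort now disables adaptive optimization entirely.
    assert get_effort_value(EffortKeys.MAX_ADAPTIVE_OPTIMIZATIONS_PER_TRACE, "medium") == 0
    assert get_effort_value(EffortKeys.ADAPTIVE_OPTIMIZATION_THRESHOLD, EffortLevel.HIGH) == 2
```

Note that because EffortLevel is a str-backed Enum, passing an enum member still satisfies the isinstance(effort, str) branch; EffortLevel(EffortLevel.HIGH) simply returns the member, so both call styles shown in the demo behave identically.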