From 8e1ba41337089e230cdfbcb61a7c6ffe2eff4f0c Mon Sep 17 00:00:00 2001 From: Rahul Shetty Date: Fri, 22 May 2026 16:38:11 +0530 Subject: [PATCH 1/3] feat: switch to drain3 rust Signed-off-by: Rahul Shetty --- Containerfile | 31 ++++++++++++++++++++++++++++--- requirements.txt | 2 +- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/Containerfile b/Containerfile index 92d7b8c..8ca9222 100644 --- a/Containerfile +++ b/Containerfile @@ -1,6 +1,22 @@ # Stage 0: grab uv binary from official uv image FROM ghcr.io/astral-sh/uv:latest AS uvbin +# Stage 1: build drain3-rs wheel (Rust toolchain kept out of the final image) +FROM python:3.11-slim AS rustbuilder + +RUN apt-get update && apt-get install -y --no-install-recommends \ + curl build-essential git \ + && rm -rf /var/lib/apt/lists/* + +RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable +ENV PATH="/root/.cargo/bin:${PATH}" + +RUN pip install --no-cache-dir maturin + +RUN pip wheel --no-cache-dir \ + git+https://github.com/Log-Analyzer/Drain3-rs.git \ + -w /wheels + # Final image: start from Red Hat UBI python image # More details: https://catalog.redhat.com/en/software/containers/ubi9/python-311/63f764b03f0b02a2e2d63fff#overview FROM registry.access.redhat.com/ubi9/python-311:9.7 @@ -30,12 +46,16 @@ RUN java -version && python3 --version && pip3 --version && uv --version WORKDIR /opt/app-root/src # Install all Python deps in one layer, then remove build-only packages +COPY --from=rustbuilder /wheels/drain3*.whl /tmp/ COPY --chmod=755 requirements.txt . -RUN uv venv --python python3.11 \ - && uv pip install --no-cache-dir -r requirements.txt \ +RUN grep -v "^drain3" requirements.txt > /tmp/reqs_no_drain3.txt \ + && uv venv --python python3.11 \ + && uv pip install --no-cache-dir /tmp/drain3*.whl \ + && uv pip install --no-cache-dir -r /tmp/reqs_no_drain3.txt \ && uv pip install --no-cache-dir \ torch==2.2.2+cpu \ - --index-url https://download.pytorch.org/whl/cpu + --index-url https://download.pytorch.org/whl/cpu \ + && rm /tmp/drain3*.whl /tmp/reqs_no_drain3.txt # Install logan package COPY --chmod=755 . . @@ -49,6 +69,11 @@ RUN yum remove -y make gcc python3-devel \ # Redirect runtime caches to /tmp so non-root user can write to them ENV HF_HOME="/tmp/hf_cache" +# Pre-create the Logan DuckDB WASM cache dir owned by the runtime user +RUN mkdir -p /opt/app-root/src/.cache/logan \ + && chown -R 1001:0 /opt/app-root/src/.cache \ + && chmod -R g+rwx /opt/app-root/src/.cache + # Drop to non-root user for runtime USER 1001 diff --git a/requirements.txt b/requirements.txt index 74615f8..e4a8b73 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,5 +19,5 @@ transformers==4.56.2 openpyxl==3.1.4 numpy==1.26.4 setuptools==70.0.0 -drain3==0.9.11 +drain3 @ git+https://github.com/Log-Analyzer/Drain3-rs.git click==8.3.1 \ No newline at end of file From b83efff39b67e930e36747bcf0e8286dcd4c106b Mon Sep 17 00:00:00 2001 From: Rahul Shetty Date: Fri, 22 May 2026 18:22:24 +0530 Subject: [PATCH 2/3] improve drain3 integration Signed-off-by: Rahul Shetty --- logan/cli.py | 6 +----- logan/drain/drain3.ini | 16 +++++++-------- logan/drain/run_drain.py | 42 ++++++++++++++++++++++++++-------------- logan/mcp/server.py | 10 ++-------- 4 files changed, 39 insertions(+), 35 deletions(-) diff --git a/logan/cli.py b/logan/cli.py index bb3b591..a57efe4 100644 --- a/logan/cli.py +++ b/logan/cli.py @@ -243,11 +243,7 @@ def analyze(files, glob, time_range, output_dir, debug_mode, process_all_files, click.echo(click.style("\nStep 2: Generating log templates...", fg="cyan")) drain_config_path = os.path.join(os.path.dirname(__file__), 'drain', 'drain3.ini') templatizer = Templatizer(debug_mode=debug_mode_str, config_path=drain_config_path) - templatizer.miner( - preprocessing_obj.df, - output_dir, - os.path.join(output_dir, "test_templates", "tm-test.templates.json") - ) + templatizer.miner(preprocessing_obj.df, output_dir) click.echo(click.style(" Templates generated successfully", fg="green")) # Step 3: Anomaly detection diff --git a/logan/drain/drain3.ini b/logan/drain/drain3.ini index 017bd5c..e044534 100644 --- a/logan/drain/drain3.ini +++ b/logan/drain/drain3.ini @@ -1,15 +1,15 @@ [SNAPSHOT] -snapshot_interval_minutes = 1 +snapshot_interval_minutes = 60 compress_state = False [MASKING] masking = [ - {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)", "mask_with": "ID"}, - {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})((?=[^A-Za-z0-9])|$)", "mask_with": "IP"}, - {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"}, - {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"}, - {"regex_pattern":"((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)", "mask_with": "HEX"}, - {"regex_pattern":"((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "mask_with": "NUM"}, + {"regex_pattern":"\\b(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))\\b", "mask_with": "ID"}, + {"regex_pattern":"\\b(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\\b", "mask_with": "IP"}, + {"regex_pattern":"\\b([0-9a-f]{6,} ?){3,}\\b", "mask_with": "SEQ"}, + {"regex_pattern":"\\b([0-9A-F]{4} ?){4,}\\b", "mask_with": "SEQ"}, + {"regex_pattern":"\\b(0x[a-f0-9A-F]+)\\b", "mask_with": "HEX"}, + {"regex_pattern":"\\b([\\-\\+]?\\d+)\\b", "mask_with": "NUM"}, {"regex_pattern":"(?<=executed cmd )(\".+?\")", "mask_with": "CMD"} ] mask_prefix = <: @@ -26,5 +26,5 @@ max_clusters = 50000 extra_delimiters = ["_"] [PROFILING] -enabled = True +enabled = False report_sec = 30 diff --git a/logan/drain/run_drain.py b/logan/drain/run_drain.py index 26f51c4..9dc4f66 100644 --- a/logan/drain/run_drain.py +++ b/logan/drain/run_drain.py @@ -5,10 +5,24 @@ import time from drain3.template_miner import TemplateMiner from drain3.template_miner_config import TemplateMinerConfig -from drain3.file_persistence import FilePersistence +from pandarallel import pandarallel from logan.store.store import LogStore +_pandarallel_initialized = False +_pandarallel_disabled = False + +def _ensure_pandarallel(): + global _pandarallel_initialized, _pandarallel_disabled + if _pandarallel_initialized: + return + if os.environ.get("LOGAN_DISABLE_PANDARALLEL") == "1": + _pandarallel_disabled = True + _pandarallel_initialized = True + return + pandarallel.initialize(progress_bar=False, nb_workers=os.cpu_count() or 2) + _pandarallel_initialized = True + class Templatizer: """ The Templatizer class is responsible for mining log templates using the DRAIN3 algorithm. @@ -24,7 +38,7 @@ class Templatizer: compute_drain_statistics(time, output_dir): Logs and stores the time taken for the DRAIN3 template mining process. - miner(df, output_dir, template): + miner(df, output_dir): Mines log templates from the given DataFrame and saves the templates to a specified path. """ @@ -67,7 +81,7 @@ def compute_drain_statistics(self, time_taken: float, output_dir: str): with open(os.path.join(output_dir, "metrics", "drain.json"), 'w') as writer: writer.write(json.dumps(metrics, indent=4)) - def miner(self, df, output_dir: str, template: str): + def miner(self, df, output_dir: str): """ Apply the DRAIN3 template mining algorithm to the given DataFrame. @@ -77,7 +91,6 @@ def miner(self, df, output_dir: str, template: str): Args: df (pd.DataFrame): DataFrame containing log data with a 'truncated_log' column for mining. output_dir (str): The directory where output files, such as statistics, will be saved. - template (str): The file path where the mined templates will be stored. """ # Record the start time of the DRAIN3 mining process start_time = time.time() @@ -87,11 +100,8 @@ def miner(self, df, output_dir: str, template: str): config = TemplateMinerConfig() config.load(self.config_path) - # Set up file persistence to store the learned templates in the specified template file - mem_persistence = FilePersistence(template) - - # Initialize the TemplateMiner with the loaded configuration and file persistence - template_miner_temporary = TemplateMiner(mem_persistence, config) + # Initialize the TemplateMiner with no persistence (single-shot analysis, no disk I/O) + template_miner_temporary = TemplateMiner(None, config) # Preserve original log text before Drain3 masking overwrites it df["original_text"] = df["text"].astype(str) @@ -103,16 +113,20 @@ def miner(self, df, output_dir: str, template: str): try: test_ids = [] template_strs = [] - variables_list = [] for log in df["truncated_log"].values: result = template_miner_temporary.add_log_message(log) test_ids.append(result['cluster_id']) - tmpl = result.get('template_mined', '') - template_strs.append(tmpl) - variables_list.append(LogStore.extract_variables(log, tmpl)) + template_strs.append(result.get('template_mined', '')) df["test_ids"] = test_ids df["template_str"] = template_strs - df["variables"] = [json.dumps(v) for v in variables_list] + + # Extract variables in parallel — embarrassingly parallel, independent of drain order + _ensure_pandarallel() + apply_fn = df.apply if _pandarallel_disabled else df.parallel_apply + df["variables"] = apply_fn( + lambda row: json.dumps(LogStore.extract_variables(row["truncated_log"], row["template_str"])), + axis=1, + ) if (self.debug_mode == "true"): template_log_dict = df.groupby("test_ids")["truncated_log"].agg(list).to_dict() diff --git a/logan/mcp/server.py b/logan/mcp/server.py index b43c224..45ff26c 100644 --- a/logan/mcp/server.py +++ b/logan/mcp/server.py @@ -206,10 +206,7 @@ def _run_drain(): os.path.dirname(os.path.dirname(__file__)), "drain", "drain3.ini" ) templatizer = Templatizer(debug_mode=debug_str, config_path=drain_config) - templatizer.miner( - df, output_dir, - os.path.join(output_dir, "test_templates", "tm-test.templates.json"), - ) + templatizer.miner(df, output_dir) return templatizer.df templatized_df = await loop.run_in_executor(None, _run_drain) @@ -368,10 +365,7 @@ def _run_drain(): os.path.dirname(os.path.dirname(__file__)), "drain", "drain3.ini" ) templatizer = Templatizer(debug_mode="true", config_path=drain_config) - templatizer.miner( - df, output_dir, - os.path.join(output_dir, "test_templates", "tm-test.templates.json"), - ) + templatizer.miner(df, output_dir) # Extract template patterns from Drain3's internal clusters config = TemplateMinerConfig() From c8b9caac582556da3302c28f8765d835090bfc14 Mon Sep 17 00:00:00 2001 From: Rahul Shetty Date: Fri, 22 May 2026 19:42:48 +0530 Subject: [PATCH 3/3] fix: permission issues with duckdb assets cache Signed-off-by: Rahul Shetty --- Containerfile | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Containerfile b/Containerfile index 8ca9222..ba3ee56 100644 --- a/Containerfile +++ b/Containerfile @@ -61,6 +61,12 @@ RUN grep -v "^drain3" requirements.txt > /tmp/reqs_no_drain3.txt \ COPY --chmod=755 . . RUN uv pip install --no-cache-dir . --no-deps +# Pre-download DuckDB WASM assets so first-run requires no network access +RUN HOME=/opt/app-root/src .venv/bin/python -c \ + "from logan.store.duckdb_assets import ensure_duckdb_assets; ensure_duckdb_assets()" \ + && chown -R 1001:0 /opt/app-root/src/.cache \ + && chmod -R g+rwx /opt/app-root/src/.cache + # Clean up build-only packages RUN yum remove -y make gcc python3-devel \ && yum clean all \ @@ -69,11 +75,6 @@ RUN yum remove -y make gcc python3-devel \ # Redirect runtime caches to /tmp so non-root user can write to them ENV HF_HOME="/tmp/hf_cache" -# Pre-create the Logan DuckDB WASM cache dir owned by the runtime user -RUN mkdir -p /opt/app-root/src/.cache/logan \ - && chown -R 1001:0 /opt/app-root/src/.cache \ - && chmod -R g+rwx /opt/app-root/src/.cache - # Drop to non-root user for runtime USER 1001