Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 29 additions & 3 deletions Containerfile
Original file line number Diff line number Diff line change
@@ -1,6 +1,22 @@
# Stage 0: grab uv binary from official uv image
FROM ghcr.io/astral-sh/uv:latest AS uvbin

# Stage 1: build drain3-rs wheel (Rust toolchain kept out of the final image)
FROM python:3.11-slim AS rustbuilder

RUN apt-get update && apt-get install -y --no-install-recommends \
curl build-essential git \
&& rm -rf /var/lib/apt/lists/*

RUN curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y --default-toolchain stable
ENV PATH="/root/.cargo/bin:${PATH}"

RUN pip install --no-cache-dir maturin

RUN pip wheel --no-cache-dir \
git+https://github.com/Log-Analyzer/Drain3-rs.git \
-w /wheels

# Final image: start from Red Hat UBI python image
# More details: https://catalog.redhat.com/en/software/containers/ubi9/python-311/63f764b03f0b02a2e2d63fff#overview
FROM registry.access.redhat.com/ubi9/python-311:9.7
Expand Down Expand Up @@ -30,17 +46,27 @@ RUN java -version && python3 --version && pip3 --version && uv --version
WORKDIR /opt/app-root/src

# Install all Python deps in one layer, then remove build-only packages
COPY --from=rustbuilder /wheels/drain3*.whl /tmp/
COPY --chmod=755 requirements.txt .
RUN uv venv --python python3.11 \
&& uv pip install --no-cache-dir -r requirements.txt \
RUN grep -v "^drain3" requirements.txt > /tmp/reqs_no_drain3.txt \
&& uv venv --python python3.11 \
&& uv pip install --no-cache-dir /tmp/drain3*.whl \
&& uv pip install --no-cache-dir -r /tmp/reqs_no_drain3.txt \
&& uv pip install --no-cache-dir \
torch==2.2.2+cpu \
--index-url https://download.pytorch.org/whl/cpu
--index-url https://download.pytorch.org/whl/cpu \
&& rm /tmp/drain3*.whl /tmp/reqs_no_drain3.txt

# Install logan package
COPY --chmod=755 . .
RUN uv pip install --no-cache-dir . --no-deps

# Pre-download DuckDB WASM assets so first-run requires no network access
RUN HOME=/opt/app-root/src .venv/bin/python -c \
"from logan.store.duckdb_assets import ensure_duckdb_assets; ensure_duckdb_assets()" \
&& chown -R 1001:0 /opt/app-root/src/.cache \
&& chmod -R g+rwx /opt/app-root/src/.cache

# Clean up build-only packages
RUN yum remove -y make gcc python3-devel \
&& yum clean all \
Expand Down
6 changes: 1 addition & 5 deletions logan/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -243,11 +243,7 @@ def analyze(files, glob, time_range, output_dir, debug_mode, process_all_files,
click.echo(click.style("\nStep 2: Generating log templates...", fg="cyan"))
drain_config_path = os.path.join(os.path.dirname(__file__), 'drain', 'drain3.ini')
templatizer = Templatizer(debug_mode=debug_mode_str, config_path=drain_config_path)
templatizer.miner(
preprocessing_obj.df,
output_dir,
os.path.join(output_dir, "test_templates", "tm-test.templates.json")
)
templatizer.miner(preprocessing_obj.df, output_dir)
click.echo(click.style(" Templates generated successfully", fg="green"))

# Step 3: Anomaly detection
Expand Down
16 changes: 8 additions & 8 deletions logan/drain/drain3.ini
Original file line number Diff line number Diff line change
@@ -1,15 +1,15 @@
[SNAPSHOT]
snapshot_interval_minutes = 1
snapshot_interval_minutes = 60
compress_state = False

[MASKING]
masking = [
{"regex_pattern":"((?<=[^A-Za-z0-9])|^)(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))((?=[^A-Za-z0-9])|$)", "mask_with": "ID"},
{"regex_pattern":"((?<=[^A-Za-z0-9])|^)(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})((?=[^A-Za-z0-9])|$)", "mask_with": "IP"},
{"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9a-f]{6,} ?){3,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"},
{"regex_pattern":"((?<=[^A-Za-z0-9])|^)([0-9A-F]{4} ?){4,}((?=[^A-Za-z0-9])|$)", "mask_with": "SEQ"},
{"regex_pattern":"((?<=[^A-Za-z0-9])|^)(0x[a-f0-9A-F]+)((?=[^A-Za-z0-9])|$)", "mask_with": "HEX"},
{"regex_pattern":"((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "mask_with": "NUM"},
{"regex_pattern":"\\b(([0-9a-f]{2,}:){3,}([0-9a-f]{2,}))\\b", "mask_with": "ID"},
{"regex_pattern":"\\b(\\d{1,3}\\.\\d{1,3}\\.\\d{1,3}\\.\\d{1,3})\\b", "mask_with": "IP"},
{"regex_pattern":"\\b([0-9a-f]{6,} ?){3,}\\b", "mask_with": "SEQ"},
{"regex_pattern":"\\b([0-9A-F]{4} ?){4,}\\b", "mask_with": "SEQ"},
{"regex_pattern":"\\b(0x[a-f0-9A-F]+)\\b", "mask_with": "HEX"},
{"regex_pattern":"\\b([\\-\\+]?\\d+)\\b", "mask_with": "NUM"},
{"regex_pattern":"(?<=executed cmd )(\".+?\")", "mask_with": "CMD"}
]
mask_prefix = <:
Expand All @@ -26,5 +26,5 @@ max_clusters = 50000
extra_delimiters = ["_"]

[PROFILING]
enabled = True
enabled = False
report_sec = 30
42 changes: 28 additions & 14 deletions logan/drain/run_drain.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,10 +5,24 @@
import time
from drain3.template_miner import TemplateMiner
from drain3.template_miner_config import TemplateMinerConfig
from drain3.file_persistence import FilePersistence
from pandarallel import pandarallel

from logan.store.store import LogStore

# Module-level state for the lazy, once-per-process pandarallel setup.
# _pandarallel_initialized: True once _ensure_pandarallel() has run to completion.
# _pandarallel_disabled: True when LOGAN_DISABLE_PANDARALLEL=1 was seen; callers
# use it to choose the serial DataFrame.apply over parallel_apply.
_pandarallel_initialized = False
_pandarallel_disabled = False


def _usable_cpu_count():
    """Return how many CPUs this process is allowed to run on.

    os.cpu_count() reports every CPU on the host, even when the process is
    pinned to a subset (cpuset / taskset). The affinity mask is preferred
    where the platform exposes it so that no more workers are started than
    there are CPUs to run them.
    NOTE(review): CPU *quotas* (CFS limits) are not visible through the
    affinity mask, so this is still an upper bound under such limits.
    """
    try:
        return len(os.sched_getaffinity(0)) or 1
    except (AttributeError, OSError):
        # sched_getaffinity is not available on every platform; keep the
        # previous behaviour there.
        return os.cpu_count() or 2


def _ensure_pandarallel():
    """Initialise pandarallel at most once per process.

    When the environment variable LOGAN_DISABLE_PANDARALLEL is exactly "1",
    pandarallel is not initialised and _pandarallel_disabled is set instead.
    Subsequent calls are no-ops in both cases.
    """
    global _pandarallel_initialized, _pandarallel_disabled
    if _pandarallel_initialized:
        return
    if os.environ.get("LOGAN_DISABLE_PANDARALLEL") == "1":
        _pandarallel_disabled = True
        _pandarallel_initialized = True
        return
    pandarallel.initialize(progress_bar=False, nb_workers=_usable_cpu_count())
    # Only mark as initialised after initialize() returned, so a failed
    # attempt is retried on the next call.
    _pandarallel_initialized = True

class Templatizer:
"""
The Templatizer class is responsible for mining log templates using the DRAIN3 algorithm.
Expand All @@ -24,7 +38,7 @@ class Templatizer:
compute_drain_statistics(time, output_dir):
Logs and stores the time taken for the DRAIN3 template mining process.

miner(df, output_dir, template):
miner(df, output_dir):
Mines log templates from the given DataFrame in memory; the mined templates are no longer persisted to a template file.
"""

Expand Down Expand Up @@ -67,7 +81,7 @@ def compute_drain_statistics(self, time_taken: float, output_dir: str):
with open(os.path.join(output_dir, "metrics", "drain.json"), 'w') as writer:
writer.write(json.dumps(metrics, indent=4))

def miner(self, df, output_dir: str, template: str):
def miner(self, df, output_dir: str):
"""
Apply the DRAIN3 template mining algorithm to the given DataFrame.

Expand All @@ -77,7 +91,6 @@ def miner(self, df, output_dir: str, template: str):
Args:
df (pd.DataFrame): DataFrame containing log data with a 'truncated_log' column for mining.
output_dir (str): The directory where output files, such as statistics, will be saved.
template (str): The file path where the mined templates will be stored.
"""
# Record the start time of the DRAIN3 mining process
start_time = time.time()
Expand All @@ -87,11 +100,8 @@ def miner(self, df, output_dir: str, template: str):
config = TemplateMinerConfig()
config.load(self.config_path)

# Set up file persistence to store the learned templates in the specified template file
mem_persistence = FilePersistence(template)

# Initialize the TemplateMiner with the loaded configuration and file persistence
template_miner_temporary = TemplateMiner(mem_persistence, config)
# Initialize the TemplateMiner with no persistence (single-shot analysis, no disk I/O)
template_miner_temporary = TemplateMiner(None, config)

# Preserve original log text before Drain3 masking overwrites it
df["original_text"] = df["text"].astype(str)
Expand All @@ -103,16 +113,20 @@ def miner(self, df, output_dir: str, template: str):
try:
test_ids = []
template_strs = []
variables_list = []
for log in df["truncated_log"].values:
result = template_miner_temporary.add_log_message(log)
test_ids.append(result['cluster_id'])
tmpl = result.get('template_mined', '')
template_strs.append(tmpl)
variables_list.append(LogStore.extract_variables(log, tmpl))
template_strs.append(result.get('template_mined', ''))
df["test_ids"] = test_ids
df["template_str"] = template_strs
df["variables"] = [json.dumps(v) for v in variables_list]

# Extract variables in parallel — embarrassingly parallel, independent of drain order
_ensure_pandarallel()
apply_fn = df.apply if _pandarallel_disabled else df.parallel_apply
df["variables"] = apply_fn(
lambda row: json.dumps(LogStore.extract_variables(row["truncated_log"], row["template_str"])),
axis=1,
)

if (self.debug_mode == "true"):
template_log_dict = df.groupby("test_ids")["truncated_log"].agg(list).to_dict()
Expand Down
10 changes: 2 additions & 8 deletions logan/mcp/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -206,10 +206,7 @@ def _run_drain():
os.path.dirname(os.path.dirname(__file__)), "drain", "drain3.ini"
)
templatizer = Templatizer(debug_mode=debug_str, config_path=drain_config)
templatizer.miner(
df, output_dir,
os.path.join(output_dir, "test_templates", "tm-test.templates.json"),
)
templatizer.miner(df, output_dir)
return templatizer.df

templatized_df = await loop.run_in_executor(None, _run_drain)
Expand Down Expand Up @@ -368,10 +365,7 @@ def _run_drain():
os.path.dirname(os.path.dirname(__file__)), "drain", "drain3.ini"
)
templatizer = Templatizer(debug_mode="true", config_path=drain_config)
templatizer.miner(
df, output_dir,
os.path.join(output_dir, "test_templates", "tm-test.templates.json"),
)
templatizer.miner(df, output_dir)

# Extract template patterns from Drain3's internal clusters
config = TemplateMinerConfig()
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -19,5 +19,5 @@ transformers==4.56.2
openpyxl==3.1.4
numpy==1.26.4
setuptools==70.0.0
drain3==0.9.11
drain3 @ git+https://github.com/Log-Analyzer/Drain3-rs.git
click==8.3.1