— Ability to delay notification after X failures (fix #71)

This commit is contained in:
Luc Didry 2024-12-09 13:59:58 +01:00
parent e0edb50e12
commit 311d86d130
No known key found for this signature in database
GPG key ID: EA868E12D0257E3C
11 changed files with 151 additions and 6 deletions

View file

@ -4,6 +4,7 @@
- ✨ — IPv4/IPv6 choice for checks, and choice for a dual-stack check (#69)
- ⚡ — Mutualize check requests (#68)
- ✨ — Ability to delay notification after X failures (#71)
## 0.6.1

View file

@ -57,7 +57,7 @@ class ArgosAgent: # pylint: disable-msg=too-many-instance-attributes
self._http_client = httpx.AsyncClient(headers=auth_header)
ua_header = {
"User-Agent": f"Argos Panoptes {VERSION} "
"User-Agent": f"Prout Argos Panoptes {VERSION} "
"(about: https://argos-monitoring.framasoft.org/)",
}
self._http_client_v4 = httpx.AsyncClient(

View file

@ -64,6 +64,14 @@ general:
# For ex., to re-try a check one minute after a failure:
# recheck_delay: "1m"
# Default setting for notifications delay.
# Say you want to be warned right after a failure on a check: set it to 0
# Say you want a second failure on the check before being warned,
# to avoid network hiccups: set it to 1
# Can be superseeded in domain configuration
# If not present, default is 0
# retry_before_notification: 0
# Defaults settings for IPv4/IPv6
# Can be superseeded in domain configuration.
# By default, Argos will check both IPv4 and IPv6 addresses of a domain
@ -143,6 +151,8 @@ ssl:
#
websites:
- domain: "https://mypads.example.org"
# Wait for a second failure before sending notification
retry_before_notification: 1
paths:
- path: "/mypads/"
# Specify the method of the HTTP request

View file

@ -121,6 +121,7 @@ class Website(BaseModel):
ipv6: bool | None = None
frequency: float | None = None
recheck_delay: float | None = None
retry_before_notification: int | None = None
paths: List[WebsitePath]
@field_validator("frequency", mode="before")
@ -206,6 +207,7 @@ class General(BaseModel):
ldap: LdapSettings | None = None
frequency: float
recheck_delay: float | None = None
retry_before_notification: int = 0
ipv4: bool = True
ipv6: bool = True
root_path: str = ""

View file

@ -24,6 +24,8 @@ class Task(BaseModel):
method: Method
expected: str
task_group: str
retry_before_notification: int
contiguous_failures: int
selected_at: datetime | None
selected_by: str | None

View file

@ -11,6 +11,55 @@ import httpx
from argos.checks.base import Severity
from argos.logging import logger
from argos.schemas.config import Config, Mail, GotifyUrl
from argos.server.models import Task
def need_alert(
last_severity: str, last_severity_update, severity: str, status: str, task: Task
) -> bool:
## Create alert… or not!
send_notif = False
# Severity has changed, and no retry before notification
if last_severity != severity and task.retry_before_notification == 0:
send_notif = True
# Seems to be a first check: create a notification
elif last_severity != severity and last_severity_update is None:
send_notif = True
# As we created a notification, avoid resending it on a
# future failure
if status != "success":
task.contiguous_failures = task.retry_before_notification
# We need retry before notification, so the severity may not have changed
# since last check
elif task.retry_before_notification != 0:
# If we got a success, and we already have created a notification:
# create notification of success immediately
if (
status == "success"
and task.contiguous_failures >= task.retry_before_notification
):
send_notif = True
task.contiguous_failures = 0
# The status is not a success
elif status != "success":
# This is a new failure
task.contiguous_failures += 1
# Severity has changed, but not to success, thats odd:
# create a notification
if (
last_severity not in ("ok", severity)
and last_severity_update is not None
):
send_notif = True
# As we created a notification, avoid resending it on a
# future failure
task.contiguous_failures = task.retry_before_notification
# Severity has not changed, but there has been enough failures
# to create a notification
elif task.retry_before_notification == task.contiguous_failures:
send_notif = True
return send_notif
def get_icon_from_severity(severity: str) -> str:

View file

@ -0,0 +1,41 @@
"""Add retries before notification feature
Revision ID: 80a29f64f91c
Revises: 8b58ced14d6e
Create Date: 2024-12-04 17:03:35.104368
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision: str = "80a29f64f91c"
down_revision: Union[str, None] = "8b58ced14d6e"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
with op.batch_alter_table("tasks", schema=None) as batch_op:
batch_op.add_column(
sa.Column(
"retry_before_notification",
sa.Integer(),
server_default="0",
nullable=False,
)
)
batch_op.add_column(
sa.Column(
"contiguous_failures", sa.Integer(), server_default="0", nullable=False
)
)
def downgrade() -> None:
with op.batch_alter_table("tasks", schema=None) as batch_op:
batch_op.drop_column("contiguous_failures")
batch_op.drop_column("retry_before_notification")

View file

@ -50,6 +50,8 @@ class Task(Base):
frequency: Mapped[float] = mapped_column()
recheck_delay: Mapped[float] = mapped_column(nullable=True)
already_retried: Mapped[bool] = mapped_column(insert_default=False)
retry_before_notification: Mapped[int] = mapped_column(insert_default=0)
contiguous_failures: Mapped[int] = mapped_column(insert_default=0)
method: Mapped[Method] = mapped_column(
Enum(
"GET",

View file

@ -106,6 +106,11 @@ async def has_config_changed(db: Session, config: schemas.Config) -> bool:
same_config = False
conf.val = str(config.general.recheck_delay)
conf.updated_at = datetime.now()
case "general_retry_before_notification":
if conf.val != str(config.general.retry_before_notification):
same_config = False
conf.val = str(config.general.retry_before_notification)
conf.updated_at = datetime.now()
db.commit()
@ -126,9 +131,15 @@ async def has_config_changed(db: Session, config: schemas.Config) -> bool:
val=str(config.general.recheck_delay),
updated_at=datetime.now(),
)
gen_retry_before_notif = ConfigCache(
name="general_retry_before_notification",
val=str(config.general.retry_before_notification),
updated_at=datetime.now(),
)
db.add(web_hash)
db.add(gen_freq)
db.add(gen_recheck)
db.add(gen_retry_before_notif)
db.commit()
return True
@ -150,6 +161,11 @@ async def update_from_config(db: Session, config: schemas.Config): # pylint: di
domain = str(website.domain)
frequency = website.frequency or config.general.frequency
recheck_delay = website.recheck_delay or config.general.recheck_delay
retry_before_notification = (
website.retry_before_notification
if website.retry_before_notification is not None
else config.general.retry_before_notification
)
ipv4 = website.ipv4 if website.ipv4 is not None else config.general.ipv4
ipv6 = website.ipv6 if website.ipv6 is not None else config.general.ipv6
if ipv4 is False and ipv6 is False:
@ -186,16 +202,25 @@ async def update_from_config(db: Session, config: schemas.Config): # pylint: di
existing_task.frequency = frequency
if recheck_delay != existing_task.recheck_delay:
existing_task.recheck_delay = recheck_delay # type: ignore[assignment]
if (
retry_before_notification
!= existing_task.retry_before_notification
):
existing_task.retry_before_notification = (
retry_before_notification
)
logger.debug(
"Skipping db task creation for url=%s, "
"method=%s, check_key=%s, expected=%s, "
"frequency=%s, recheck_delay=%s, ip_version=%s.",
"frequency=%s, recheck_delay=%s, "
"retry_before_notification=%s, ip_version=%s.",
url,
p.method,
check_key,
expected,
frequency,
recheck_delay,
retry_before_notification,
ip_version,
)
@ -212,6 +237,7 @@ async def update_from_config(db: Session, config: schemas.Config): # pylint: di
expected=expected,
frequency=frequency,
recheck_delay=recheck_delay,
retry_before_notification=retry_before_notification,
already_retried=False,
)
logger.debug("Adding a new task in the db: %s", task)

View file

@ -7,7 +7,7 @@ from sqlalchemy.orm import Session
from argos.logging import logger
from argos.schemas import AgentResult, Config, Task
from argos.server import queries
from argos.server.alerting import handle_alert
from argos.server.alerting import handle_alert, need_alert
from argos.server.routes.dependencies import get_config, get_db, verify_token
route = APIRouter()
@ -58,16 +58,26 @@ async def create_results( # pylint: disable-msg=too-many-positional-arguments
logger.error("Unable to find task %i", agent_result.task_id)
else:
last_severity = task.severity
last_severity_update = task.last_severity_update
result = await queries.create_result(db, agent_result, agent_id)
check = task.get_check()
status, severity = await check.finalize(config, result, **result.context)
result.set_status(status, severity)
task.set_times_severity_and_deselect(severity, result.submitted_at)
# Dont create an alert if the severity has not changed
if last_severity != severity:
send_notif = need_alert(
last_severity, last_severity_update, severity, status, task
)
if send_notif:
background_tasks.add_task(
handle_alert, config, result, task, severity, last_severity, request
handle_alert,
config,
result,
task,
severity,
last_severity,
request,
)
db_results.append(result)

View file

@ -39,6 +39,8 @@ def ssl_task(now):
method="GET",
task_group="GET-6-https://example.org",
check="ssl-certificate-expiration",
retry_before_notification=0,
contiguous_failures=0,
expected="on-check",
selected_at=now,
selected_by="pytest",