diff --git a/CHANGELOG.md b/CHANGELOG.md index 4ada528..adde174 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,6 +4,7 @@ - ✨ — IPv4/IPv6 choice for checks, and choice for a dual-stack check (#69) - ⚡ — Mutualize check requests (#68) +- ✨ — Ability to delay notification after X failures (#71) ## 0.6.1 diff --git a/argos/config-example.yaml b/argos/config-example.yaml index e44e9f3..b957502 100644 --- a/argos/config-example.yaml +++ b/argos/config-example.yaml @@ -64,6 +64,14 @@ general: # For ex., to re-try a check one minute after a failure: # recheck_delay: "1m" + # Default setting for notifications delay. + # Say you want to be warned right after a failure on a check: set it to 0 + # Say you want a second failure on the check before being warned, + # to avoid network hiccups: set it to 1 + # Can be superseded in domain configuration + # If not present, default is 0 + # retry_before_notification: 0 + # Defaults settings for IPv4/IPv6 # Can be superseeded in domain configuration.
# By default, Argos will check both IPv4 and IPv6 addresses of a domain @@ -143,6 +151,8 @@ ssl: # websites: - domain: "https://mypads.example.org" + # Wait for a second failure before sending notification + retry_before_notification: 1 paths: - path: "/mypads/" # Specify the method of the HTTP request diff --git a/argos/schemas/config.py b/argos/schemas/config.py index ae0f9f2..caeba95 100644 --- a/argos/schemas/config.py +++ b/argos/schemas/config.py @@ -121,6 +121,7 @@ class Website(BaseModel): ipv6: bool | None = None frequency: float | None = None recheck_delay: float | None = None + retry_before_notification: int | None = None paths: List[WebsitePath] @field_validator("frequency", mode="before") @@ -206,6 +207,7 @@ class General(BaseModel): ldap: LdapSettings | None = None frequency: float recheck_delay: float | None = None + retry_before_notification: int = 0 ipv4: bool = True ipv6: bool = True root_path: str = "" diff --git a/argos/schemas/models.py b/argos/schemas/models.py index 36c5fe8..b1eb33a 100644 --- a/argos/schemas/models.py +++ b/argos/schemas/models.py @@ -24,6 +24,8 @@ class Task(BaseModel): method: Method expected: str task_group: str + retry_before_notification: int + contiguous_failures: int selected_at: datetime | None selected_by: str | None diff --git a/argos/server/alerting.py b/argos/server/alerting.py index 2511ffd..1145053 100644 --- a/argos/server/alerting.py +++ b/argos/server/alerting.py @@ -11,6 +11,55 @@ import httpx from argos.checks.base import Severity from argos.logging import logger from argos.schemas.config import Config, Mail, GotifyUrl +from argos.server.models import Task + + +def need_alert( + last_severity: str, last_severity_update, severity: str, status: str, task: Task +) -> bool: + ## Create alert… or not! 
+ send_notif = False + # Severity has changed, and no retry before notification + if last_severity != severity and task.retry_before_notification == 0: + send_notif = True + # Seems to be a first check: create a notification + elif last_severity != severity and last_severity_update is None: + send_notif = True + # As we created a notification, avoid resending it on a + # future failure + if status != "success": + task.contiguous_failures = task.retry_before_notification + 1 + # We need retry before notification, so the severity may not have changed + # since last check + elif task.retry_before_notification != 0: + # If we got a success, and we already have created a notification + # (i.e. there were more failures than allowed retries): + # create notification of success immediately + if status == "success": + send_notif = task.contiguous_failures > task.retry_before_notification + # Any success ends the streak of contiguous failures, even + # when no notification was sent + task.contiguous_failures = 0 + # The status is not a success + elif status != "success": + # This is a new failure + task.contiguous_failures += 1 + # Severity has changed, but not to success, that’s odd: + # create a notification + if ( + last_severity not in ("ok", severity) + and last_severity_update is not None + ): + send_notif = True + # As we created a notification, avoid resending it on a + # future failure + task.contiguous_failures = task.retry_before_notification + 1 + # Severity has not changed, but there have been enough failures + # to create a notification + elif task.contiguous_failures == task.retry_before_notification + 1: + send_notif = True + + return send_notif def get_icon_from_severity(severity: str) -> str: diff --git a/argos/server/migrations/versions/80a29f64f91c_add_retries_before_notification_feature.py b/argos/server/migrations/versions/80a29f64f91c_add_retries_before_notification_feature.py new file mode 100644 index 0000000..91ec086 --- /dev/null +++ b/argos/server/migrations/versions/80a29f64f91c_add_retries_before_notification_feature.py @@ -0,0 +1,41 @@ +"""Add retries before notification feature +
+Revision ID: 80a29f64f91c +Revises: 8b58ced14d6e +Create Date: 2024-12-04 17:03:35.104368 + +""" +from typing import Sequence, Union + +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. +revision: str = "80a29f64f91c" +down_revision: Union[str, None] = "8b58ced14d6e" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + with op.batch_alter_table("tasks", schema=None) as batch_op: + batch_op.add_column( + sa.Column( + "retry_before_notification", + sa.Integer(), + server_default="0", + nullable=False, + ) + ) + batch_op.add_column( + sa.Column( + "contiguous_failures", sa.Integer(), server_default="0", nullable=False + ) + ) + + +def downgrade() -> None: + with op.batch_alter_table("tasks", schema=None) as batch_op: + batch_op.drop_column("contiguous_failures") + batch_op.drop_column("retry_before_notification") diff --git a/argos/server/models.py b/argos/server/models.py index 45b811e..c031d7b 100644 --- a/argos/server/models.py +++ b/argos/server/models.py @@ -50,6 +50,8 @@ class Task(Base): frequency: Mapped[float] = mapped_column() recheck_delay: Mapped[float] = mapped_column(nullable=True) already_retried: Mapped[bool] = mapped_column(insert_default=False) + retry_before_notification: Mapped[int] = mapped_column(insert_default=0) + contiguous_failures: Mapped[int] = mapped_column(insert_default=0) method: Mapped[Method] = mapped_column( Enum( "GET", diff --git a/argos/server/queries.py b/argos/server/queries.py index 6489dfe..0e4656c 100644 --- a/argos/server/queries.py +++ b/argos/server/queries.py @@ -106,6 +106,11 @@ async def has_config_changed(db: Session, config: schemas.Config) -> bool: same_config = False conf.val = str(config.general.recheck_delay) conf.updated_at = datetime.now() + case "general_retry_before_notification": + if conf.val != str(config.general.retry_before_notification): + same_config = False + conf.val = 
str(config.general.retry_before_notification) + conf.updated_at = datetime.now() db.commit() @@ -126,9 +131,15 @@ async def has_config_changed(db: Session, config: schemas.Config) -> bool: val=str(config.general.recheck_delay), updated_at=datetime.now(), ) + gen_retry_before_notif = ConfigCache( + name="general_retry_before_notification", + val=str(config.general.retry_before_notification), + updated_at=datetime.now(), + ) db.add(web_hash) db.add(gen_freq) db.add(gen_recheck) + db.add(gen_retry_before_notif) db.commit() return True @@ -150,6 +161,11 @@ async def update_from_config(db: Session, config: schemas.Config): # pylint: di domain = str(website.domain) frequency = website.frequency or config.general.frequency recheck_delay = website.recheck_delay or config.general.recheck_delay + retry_before_notification = ( + website.retry_before_notification + if website.retry_before_notification is not None + else config.general.retry_before_notification + ) ipv4 = website.ipv4 if website.ipv4 is not None else config.general.ipv4 ipv6 = website.ipv6 if website.ipv6 is not None else config.general.ipv6 if ipv4 is False and ipv6 is False: @@ -186,16 +202,25 @@ async def update_from_config(db: Session, config: schemas.Config): # pylint: di existing_task.frequency = frequency if recheck_delay != existing_task.recheck_delay: existing_task.recheck_delay = recheck_delay # type: ignore[assignment] + if ( + retry_before_notification + != existing_task.retry_before_notification + ): + existing_task.retry_before_notification = ( + retry_before_notification + ) logger.debug( "Skipping db task creation for url=%s, " "method=%s, check_key=%s, expected=%s, " - "frequency=%s, recheck_delay=%s, ip_version=%s.", + "frequency=%s, recheck_delay=%s, " + "retry_before_notification=%s, ip_version=%s.", url, p.method, check_key, expected, frequency, recheck_delay, + retry_before_notification, ip_version, ) @@ -212,6 +237,7 @@ async def update_from_config(db: Session, config: schemas.Config): # 
pylint: di expected=expected, frequency=frequency, recheck_delay=recheck_delay, + retry_before_notification=retry_before_notification, already_retried=False, ) logger.debug("Adding a new task in the db: %s", task) diff --git a/argos/server/routes/api.py b/argos/server/routes/api.py index cc96132..78dd83d 100644 --- a/argos/server/routes/api.py +++ b/argos/server/routes/api.py @@ -7,7 +7,7 @@ from sqlalchemy.orm import Session from argos.logging import logger from argos.schemas import AgentResult, Config, Task from argos.server import queries -from argos.server.alerting import handle_alert +from argos.server.alerting import handle_alert, need_alert from argos.server.routes.dependencies import get_config, get_db, verify_token route = APIRouter() @@ -58,16 +58,26 @@ async def create_results( # pylint: disable-msg=too-many-positional-arguments logger.error("Unable to find task %i", agent_result.task_id) else: last_severity = task.severity + last_severity_update = task.last_severity_update result = await queries.create_result(db, agent_result, agent_id) check = task.get_check() status, severity = await check.finalize(config, result, **result.context) result.set_status(status, severity) task.set_times_severity_and_deselect(severity, result.submitted_at) - # Don’t create an alert if the severity has not changed - if last_severity != severity: + send_notif = need_alert( + last_severity, last_severity_update, severity, status, task + ) + + if send_notif: background_tasks.add_task( - handle_alert, config, result, task, severity, last_severity, request + handle_alert, + config, + result, + task, + severity, + last_severity, + request, ) db_results.append(result) diff --git a/tests/test_checks.py b/tests/test_checks.py index 460d5bf..18f3a37 100644 --- a/tests/test_checks.py +++ b/tests/test_checks.py @@ -39,6 +39,8 @@ def ssl_task(now): method="GET", task_group="GET-6-https://example.org", check="ssl-certificate-expiration", + retry_before_notification=0, + 
contiguous_failures=0, expected="on-check", selected_at=now, selected_by="pytest",