— Ability to delay notification after X failures (fix #71)

This commit is contained in:
Luc Didry 2024-12-09 13:59:58 +01:00
parent e0edb50e12
commit 311d86d130
No known key found for this signature in database
GPG key ID: EA868E12D0257E3C
11 changed files with 151 additions and 6 deletions

View file

@ -4,6 +4,7 @@
- ✨ — IPv4/IPv6 choice for checks, and choice for a dual-stack check (#69) - ✨ — IPv4/IPv6 choice for checks, and choice for a dual-stack check (#69)
- ⚡ — Mutualize check requests (#68) - ⚡ — Mutualize check requests (#68)
- ✨ — Ability to delay notification after X failures (#71)
## 0.6.1 ## 0.6.1

View file

@ -57,7 +57,7 @@ class ArgosAgent: # pylint: disable-msg=too-many-instance-attributes
self._http_client = httpx.AsyncClient(headers=auth_header) self._http_client = httpx.AsyncClient(headers=auth_header)
ua_header = { ua_header = {
"User-Agent": f"Argos Panoptes {VERSION} " "User-Agent": f"Prout Argos Panoptes {VERSION} "
"(about: https://argos-monitoring.framasoft.org/)", "(about: https://argos-monitoring.framasoft.org/)",
} }
self._http_client_v4 = httpx.AsyncClient( self._http_client_v4 = httpx.AsyncClient(

View file

@ -64,6 +64,14 @@ general:
# For ex., to re-try a check one minute after a failure: # For ex., to re-try a check one minute after a failure:
# recheck_delay: "1m" # recheck_delay: "1m"
# Default setting for the notification delay, i.e. the number of failed
# checks to tolerate before sending a notification.
# Say you want to be warned right after a failure on a check: set it to 0
# Say you want a second failure on the check before being warned,
# to avoid network hiccups: set it to 1
# Can be superseded in domain configuration
# If not present, default is 0
# retry_before_notification: 0
# Defaults settings for IPv4/IPv6 # Defaults settings for IPv4/IPv6
# Can be superseded in domain configuration. # Can be superseded in domain configuration.
# By default, Argos will check both IPv4 and IPv6 addresses of a domain # By default, Argos will check both IPv4 and IPv6 addresses of a domain
@ -143,6 +151,8 @@ ssl:
# #
websites: websites:
- domain: "https://mypads.example.org" - domain: "https://mypads.example.org"
# Wait for a second failure before sending notification
retry_before_notification: 1
paths: paths:
- path: "/mypads/" - path: "/mypads/"
# Specify the method of the HTTP request # Specify the method of the HTTP request

View file

@ -121,6 +121,7 @@ class Website(BaseModel):
ipv6: bool | None = None ipv6: bool | None = None
frequency: float | None = None frequency: float | None = None
recheck_delay: float | None = None recheck_delay: float | None = None
retry_before_notification: int | None = None
paths: List[WebsitePath] paths: List[WebsitePath]
@field_validator("frequency", mode="before") @field_validator("frequency", mode="before")
@ -206,6 +207,7 @@ class General(BaseModel):
ldap: LdapSettings | None = None ldap: LdapSettings | None = None
frequency: float frequency: float
recheck_delay: float | None = None recheck_delay: float | None = None
retry_before_notification: int = 0
ipv4: bool = True ipv4: bool = True
ipv6: bool = True ipv6: bool = True
root_path: str = "" root_path: str = ""

View file

@ -24,6 +24,8 @@ class Task(BaseModel):
method: Method method: Method
expected: str expected: str
task_group: str task_group: str
retry_before_notification: int
contiguous_failures: int
selected_at: datetime | None selected_at: datetime | None
selected_by: str | None selected_by: str | None

View file

@ -11,6 +11,55 @@ import httpx
from argos.checks.base import Severity from argos.checks.base import Severity
from argos.logging import logger from argos.logging import logger
from argos.schemas.config import Config, Mail, GotifyUrl from argos.schemas.config import Config, Mail, GotifyUrl
from argos.server.models import Task
def need_alert(
    last_severity: str,
    last_severity_update,
    severity: str,
    status: str,
    task: "Task",
) -> bool:
    """Tell whether the latest result of a check must produce a notification.

    ``task.retry_before_notification`` is the number of failures tolerated
    silently: with 0 a notification is sent as soon as the severity changes,
    with 1 it is sent on the second failure in a row, and so on.

    Args:
        last_severity: Severity of the task before this result.
        last_severity_update: When the severity was last updated, or ``None``
            if it has never been updated (treated as a first check).
        severity: Severity computed from this result.
        status: Status computed from this result; only ``"success"`` is
            treated as a success, anything else counts as a failure.
        task: The checked task. Its ``contiguous_failures`` counter is
            updated in place.

    Returns:
        ``True`` if a notification must be sent, ``False`` otherwise.
    """
    # A notification is due on the failure that follows the tolerated ones.
    # Once it has been sent, the counter is kept at or above this value so
    # that later failures stay silent and the recovery gets announced.
    notify_at = task.retry_before_notification + 1
    severity_changed = last_severity != severity

    # Severity has changed, and no retry before notification
    if severity_changed and task.retry_before_notification == 0:
        return True

    # Seems to be a first check: create a notification
    if severity_changed and last_severity_update is None:
        # As we created a notification, avoid resending it on a
        # future failure
        if status != "success":
            task.contiguous_failures = notify_at
        return True

    # No retry before notification and the severity has not changed:
    # nothing new to report
    if task.retry_before_notification == 0:
        return False

    # From here on, retries are needed before notification, so the severity
    # may not have changed since last check
    if status == "success":
        # Announce the recovery only if the failure itself was announced
        failure_was_notified = task.contiguous_failures >= notify_at
        # A success always ends the current run of failures, otherwise
        # isolated hiccups would add up as if they were contiguous
        task.contiguous_failures = 0
        return failure_was_notified

    # This is a new failure
    task.contiguous_failures += 1

    # Severity has changed, but not to success, that's odd:
    # create a notification
    if last_severity not in ("ok", severity) and last_severity_update is not None:
        # As we created a notification, avoid resending it on a
        # future failure
        task.contiguous_failures = notify_at
        return True

    # Severity has not changed: notify only when the run of failures has
    # just gone past the tolerated number
    return task.contiguous_failures == notify_at
def get_icon_from_severity(severity: str) -> str: def get_icon_from_severity(severity: str) -> str:

View file

@ -0,0 +1,41 @@
"""Add retries before notification feature
Revision ID: 80a29f64f91c
Revises: 8b58ced14d6e
Create Date: 2024-12-04 17:03:35.104368
"""
from typing import Sequence, Union
from alembic import op
import sqlalchemy as sa
# revision identifiers, used by Alembic.
revision: str = "80a29f64f91c"
down_revision: Union[str, None] = "8b58ced14d6e"
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None
def upgrade() -> None:
    """Add the columns of the retries-before-notification feature to ``tasks``.

    Both columns are non-nullable integers with a server-side default of
    ``"0"``.
    """
    counter_columns = ("retry_before_notification", "contiguous_failures")
    with op.batch_alter_table("tasks", schema=None) as batch_op:
        # Same definition for both counters, added in declaration order
        for column_name in counter_columns:
            counter = sa.Column(
                column_name,
                sa.Integer(),
                server_default="0",
                nullable=False,
            )
            batch_op.add_column(counter)
def downgrade() -> None:
    """Drop the two columns added by ``upgrade``, last added first."""
    with op.batch_alter_table("tasks", schema=None) as batch_op:
        for column_name in ("contiguous_failures", "retry_before_notification"):
            batch_op.drop_column(column_name)

View file

@ -50,6 +50,8 @@ class Task(Base):
frequency: Mapped[float] = mapped_column() frequency: Mapped[float] = mapped_column()
recheck_delay: Mapped[float] = mapped_column(nullable=True) recheck_delay: Mapped[float] = mapped_column(nullable=True)
already_retried: Mapped[bool] = mapped_column(insert_default=False) already_retried: Mapped[bool] = mapped_column(insert_default=False)
retry_before_notification: Mapped[int] = mapped_column(insert_default=0)
contiguous_failures: Mapped[int] = mapped_column(insert_default=0)
method: Mapped[Method] = mapped_column( method: Mapped[Method] = mapped_column(
Enum( Enum(
"GET", "GET",

View file

@ -106,6 +106,11 @@ async def has_config_changed(db: Session, config: schemas.Config) -> bool:
same_config = False same_config = False
conf.val = str(config.general.recheck_delay) conf.val = str(config.general.recheck_delay)
conf.updated_at = datetime.now() conf.updated_at = datetime.now()
case "general_retry_before_notification":
if conf.val != str(config.general.retry_before_notification):
same_config = False
conf.val = str(config.general.retry_before_notification)
conf.updated_at = datetime.now()
db.commit() db.commit()
@ -126,9 +131,15 @@ async def has_config_changed(db: Session, config: schemas.Config) -> bool:
val=str(config.general.recheck_delay), val=str(config.general.recheck_delay),
updated_at=datetime.now(), updated_at=datetime.now(),
) )
gen_retry_before_notif = ConfigCache(
name="general_retry_before_notification",
val=str(config.general.retry_before_notification),
updated_at=datetime.now(),
)
db.add(web_hash) db.add(web_hash)
db.add(gen_freq) db.add(gen_freq)
db.add(gen_recheck) db.add(gen_recheck)
db.add(gen_retry_before_notif)
db.commit() db.commit()
return True return True
@ -150,6 +161,11 @@ async def update_from_config(db: Session, config: schemas.Config): # pylint: di
domain = str(website.domain) domain = str(website.domain)
frequency = website.frequency or config.general.frequency frequency = website.frequency or config.general.frequency
recheck_delay = website.recheck_delay or config.general.recheck_delay recheck_delay = website.recheck_delay or config.general.recheck_delay
retry_before_notification = (
website.retry_before_notification
if website.retry_before_notification is not None
else config.general.retry_before_notification
)
ipv4 = website.ipv4 if website.ipv4 is not None else config.general.ipv4 ipv4 = website.ipv4 if website.ipv4 is not None else config.general.ipv4
ipv6 = website.ipv6 if website.ipv6 is not None else config.general.ipv6 ipv6 = website.ipv6 if website.ipv6 is not None else config.general.ipv6
if ipv4 is False and ipv6 is False: if ipv4 is False and ipv6 is False:
@ -186,16 +202,25 @@ async def update_from_config(db: Session, config: schemas.Config): # pylint: di
existing_task.frequency = frequency existing_task.frequency = frequency
if recheck_delay != existing_task.recheck_delay: if recheck_delay != existing_task.recheck_delay:
existing_task.recheck_delay = recheck_delay # type: ignore[assignment] existing_task.recheck_delay = recheck_delay # type: ignore[assignment]
if (
retry_before_notification
!= existing_task.retry_before_notification
):
existing_task.retry_before_notification = (
retry_before_notification
)
logger.debug( logger.debug(
"Skipping db task creation for url=%s, " "Skipping db task creation for url=%s, "
"method=%s, check_key=%s, expected=%s, " "method=%s, check_key=%s, expected=%s, "
"frequency=%s, recheck_delay=%s, ip_version=%s.", "frequency=%s, recheck_delay=%s, "
"retry_before_notification=%s, ip_version=%s.",
url, url,
p.method, p.method,
check_key, check_key,
expected, expected,
frequency, frequency,
recheck_delay, recheck_delay,
retry_before_notification,
ip_version, ip_version,
) )
@ -212,6 +237,7 @@ async def update_from_config(db: Session, config: schemas.Config): # pylint: di
expected=expected, expected=expected,
frequency=frequency, frequency=frequency,
recheck_delay=recheck_delay, recheck_delay=recheck_delay,
retry_before_notification=retry_before_notification,
already_retried=False, already_retried=False,
) )
logger.debug("Adding a new task in the db: %s", task) logger.debug("Adding a new task in the db: %s", task)

View file

@ -7,7 +7,7 @@ from sqlalchemy.orm import Session
from argos.logging import logger from argos.logging import logger
from argos.schemas import AgentResult, Config, Task from argos.schemas import AgentResult, Config, Task
from argos.server import queries from argos.server import queries
from argos.server.alerting import handle_alert from argos.server.alerting import handle_alert, need_alert
from argos.server.routes.dependencies import get_config, get_db, verify_token from argos.server.routes.dependencies import get_config, get_db, verify_token
route = APIRouter() route = APIRouter()
@ -58,16 +58,26 @@ async def create_results( # pylint: disable-msg=too-many-positional-arguments
logger.error("Unable to find task %i", agent_result.task_id) logger.error("Unable to find task %i", agent_result.task_id)
else: else:
last_severity = task.severity last_severity = task.severity
last_severity_update = task.last_severity_update
result = await queries.create_result(db, agent_result, agent_id) result = await queries.create_result(db, agent_result, agent_id)
check = task.get_check() check = task.get_check()
status, severity = await check.finalize(config, result, **result.context) status, severity = await check.finalize(config, result, **result.context)
result.set_status(status, severity) result.set_status(status, severity)
task.set_times_severity_and_deselect(severity, result.submitted_at) task.set_times_severity_and_deselect(severity, result.submitted_at)
# Dont create an alert if the severity has not changed send_notif = need_alert(
if last_severity != severity: last_severity, last_severity_update, severity, status, task
)
if send_notif:
background_tasks.add_task( background_tasks.add_task(
handle_alert, config, result, task, severity, last_severity, request handle_alert,
config,
result,
task,
severity,
last_severity,
request,
) )
db_results.append(result) db_results.append(result)

View file

@ -39,6 +39,8 @@ def ssl_task(now):
method="GET", method="GET",
task_group="GET-6-https://example.org", task_group="GET-6-https://example.org",
check="ssl-certificate-expiration", check="ssl-certificate-expiration",
retry_before_notification=0,
contiguous_failures=0,
expected="on-check", expected="on-check",
selected_at=now, selected_at=now,
selected_by="pytest", selected_by="pytest",