mirror of
https://framagit.org/framasoft/framaspace/argos.git
synced 2025-04-28 09:52:38 +02:00
✨ — Ability to delay notification after X failures (fix #71)
This commit is contained in:
parent
e0edb50e12
commit
311d86d130
11 changed files with 151 additions and 6 deletions
|
@ -4,6 +4,7 @@
|
|||
|
||||
- ✨ — IPv4/IPv6 choice for checks, and choice for a dual-stack check (#69)
|
||||
- ⚡ — Mutualize check requests (#68)
|
||||
- ✨ — Ability to delay notification after X failures (#71)
|
||||
|
||||
## 0.6.1
|
||||
|
||||
|
|
|
@ -57,7 +57,7 @@ class ArgosAgent: # pylint: disable-msg=too-many-instance-attributes
|
|||
self._http_client = httpx.AsyncClient(headers=auth_header)
|
||||
|
||||
ua_header = {
|
||||
"User-Agent": f"Argos Panoptes {VERSION} "
|
||||
"User-Agent": f"Prout Argos Panoptes {VERSION} "
|
||||
"(about: https://argos-monitoring.framasoft.org/)",
|
||||
}
|
||||
self._http_client_v4 = httpx.AsyncClient(
|
||||
|
|
|
@ -64,6 +64,14 @@ general:
|
|||
# For ex., to re-try a check one minute after a failure:
|
||||
# recheck_delay: "1m"
|
||||
|
||||
# Default setting for notifications delay.
|
||||
# Say you want to be warned right after a failure on a check: set it to 0
|
||||
# Say you want a second failure on the check before being warned,
|
||||
# to avoid network hiccups: set it to 1
|
||||
# Can be superseded in domain configuration
|
||||
# If not present, default is 0
|
||||
# retry_before_notification: 0
|
||||
|
||||
# Default settings for IPv4/IPv6
|
||||
# Can be superseded in domain configuration.
|
||||
# By default, Argos will check both IPv4 and IPv6 addresses of a domain
|
||||
|
@ -143,6 +151,8 @@ ssl:
|
|||
#
|
||||
websites:
|
||||
- domain: "https://mypads.example.org"
|
||||
# Wait for a second failure before sending notification
|
||||
retry_before_notification: 1
|
||||
paths:
|
||||
- path: "/mypads/"
|
||||
# Specify the method of the HTTP request
|
||||
|
|
|
@ -121,6 +121,7 @@ class Website(BaseModel):
|
|||
ipv6: bool | None = None
|
||||
frequency: float | None = None
|
||||
recheck_delay: float | None = None
|
||||
retry_before_notification: int | None = None
|
||||
paths: List[WebsitePath]
|
||||
|
||||
@field_validator("frequency", mode="before")
|
||||
|
@ -206,6 +207,7 @@ class General(BaseModel):
|
|||
ldap: LdapSettings | None = None
|
||||
frequency: float
|
||||
recheck_delay: float | None = None
|
||||
retry_before_notification: int = 0
|
||||
ipv4: bool = True
|
||||
ipv6: bool = True
|
||||
root_path: str = ""
|
||||
|
|
|
@ -24,6 +24,8 @@ class Task(BaseModel):
|
|||
method: Method
|
||||
expected: str
|
||||
task_group: str
|
||||
retry_before_notification: int
|
||||
contiguous_failures: int
|
||||
selected_at: datetime | None
|
||||
selected_by: str | None
|
||||
|
||||
|
|
|
@ -11,6 +11,55 @@ import httpx
|
|||
from argos.checks.base import Severity
|
||||
from argos.logging import logger
|
||||
from argos.schemas.config import Config, Mail, GotifyUrl
|
||||
from argos.server.models import Task
|
||||
|
||||
|
||||
def need_alert(
    last_severity: str, last_severity_update, severity: str, status: str, task: Task
) -> bool:
    """Decide whether the latest result of a task must trigger a notification.

    ``task.retry_before_notification`` is the number of failures tolerated
    before warning: 0 notifies on the first failure, 1 waits for a second
    contiguous failure, and so on.  ``task.contiguous_failures`` is updated
    in place to track the current failure streak.

    Args:
        last_severity: severity of the task before this result.
        last_severity_update: when the severity previously changed;
            ``None`` is treated as "this is the first check".
        severity: severity computed for this result.
        status: status computed for this result; ``"success"`` ends a
            failure streak, any other value counts as a failure.
        task: the task; ``retry_before_notification`` is read and
            ``contiguous_failures`` is mutated.

    Returns:
        ``True`` when an alert should be created, ``False`` otherwise.
    """
    # Streak length at which the notification is emitted: N tolerated
    # failures means the alert goes out on failure number N + 1.
    notify_at = task.retry_before_notification + 1
    severity_changed = last_severity != severity

    # Severity has changed, and no retry before notification
    if severity_changed and task.retry_before_notification == 0:
        return True

    # Seems to be a first check: create a notification
    if severity_changed and last_severity_update is None:
        # As we created a notification, avoid resending it on a
        # future failure
        if status != "success":
            task.contiguous_failures = notify_at
        return True

    # Without retries, an unchanged severity never produces a notification
    if task.retry_before_notification == 0:
        return False

    # From here on retries are enabled, so the severity may not have
    # changed since the last check.
    if status == "success":
        # Only announce the recovery if the failure itself was announced.
        already_notified = task.contiguous_failures >= notify_at
        # A success always breaks the streak, even a short one, so that
        # later unrelated failures do not add up to a notification.
        task.contiguous_failures = 0
        return already_notified

    # This is a new failure
    task.contiguous_failures += 1

    # Severity has changed, but not to success, that's odd:
    # create a notification
    if last_severity not in ("ok", severity) and last_severity_update is not None:
        # As we created a notification, avoid resending it on a
        # future failure
        task.contiguous_failures = notify_at
        return True

    # Severity has not changed: notify exactly once, when the streak
    # reaches the threshold.
    return task.contiguous_failures == notify_at
|
||||
|
||||
|
||||
def get_icon_from_severity(severity: str) -> str:
|
||||
|
|
|
@ -0,0 +1,41 @@
|
|||
"""Add retries before notification feature
|
||||
|
||||
Revision ID: 80a29f64f91c
|
||||
Revises: 8b58ced14d6e
|
||||
Create Date: 2024-12-04 17:03:35.104368
|
||||
|
||||
"""
|
||||
from typing import Sequence, Union
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision: str = "80a29f64f91c"
|
||||
down_revision: Union[str, None] = "8b58ced14d6e"
|
||||
branch_labels: Union[str, Sequence[str], None] = None
|
||||
depends_on: Union[str, Sequence[str], None] = None
|
||||
|
||||
|
||||
def upgrade() -> None:
    """Add the retry/notification bookkeeping columns to the ``tasks`` table.

    Both columns are non-nullable integers with a server-side default of
    ``"0"``.
    """
    new_columns = ("retry_before_notification", "contiguous_failures")
    with op.batch_alter_table("tasks", schema=None) as batch_op:
        for column_name in new_columns:
            batch_op.add_column(
                sa.Column(
                    column_name,
                    sa.Integer(),
                    server_default="0",
                    nullable=False,
                )
            )
|
||||
|
||||
|
||||
def downgrade() -> None:
    """Revert the migration by dropping both added columns from ``tasks``."""
    obsolete_columns = ("contiguous_failures", "retry_before_notification")
    with op.batch_alter_table("tasks", schema=None) as batch_op:
        for column_name in obsolete_columns:
            batch_op.drop_column(column_name)
|
|
@ -50,6 +50,8 @@ class Task(Base):
|
|||
frequency: Mapped[float] = mapped_column()
|
||||
recheck_delay: Mapped[float] = mapped_column(nullable=True)
|
||||
already_retried: Mapped[bool] = mapped_column(insert_default=False)
|
||||
retry_before_notification: Mapped[int] = mapped_column(insert_default=0)
|
||||
contiguous_failures: Mapped[int] = mapped_column(insert_default=0)
|
||||
method: Mapped[Method] = mapped_column(
|
||||
Enum(
|
||||
"GET",
|
||||
|
|
|
@ -106,6 +106,11 @@ async def has_config_changed(db: Session, config: schemas.Config) -> bool:
|
|||
same_config = False
|
||||
conf.val = str(config.general.recheck_delay)
|
||||
conf.updated_at = datetime.now()
|
||||
case "general_retry_before_notification":
|
||||
if conf.val != str(config.general.retry_before_notification):
|
||||
same_config = False
|
||||
conf.val = str(config.general.retry_before_notification)
|
||||
conf.updated_at = datetime.now()
|
||||
|
||||
db.commit()
|
||||
|
||||
|
@ -126,9 +131,15 @@ async def has_config_changed(db: Session, config: schemas.Config) -> bool:
|
|||
val=str(config.general.recheck_delay),
|
||||
updated_at=datetime.now(),
|
||||
)
|
||||
gen_retry_before_notif = ConfigCache(
|
||||
name="general_retry_before_notification",
|
||||
val=str(config.general.retry_before_notification),
|
||||
updated_at=datetime.now(),
|
||||
)
|
||||
db.add(web_hash)
|
||||
db.add(gen_freq)
|
||||
db.add(gen_recheck)
|
||||
db.add(gen_retry_before_notif)
|
||||
db.commit()
|
||||
|
||||
return True
|
||||
|
@ -150,6 +161,11 @@ async def update_from_config(db: Session, config: schemas.Config): # pylint: di
|
|||
domain = str(website.domain)
|
||||
frequency = website.frequency or config.general.frequency
|
||||
recheck_delay = website.recheck_delay or config.general.recheck_delay
|
||||
retry_before_notification = (
|
||||
website.retry_before_notification
|
||||
if website.retry_before_notification is not None
|
||||
else config.general.retry_before_notification
|
||||
)
|
||||
ipv4 = website.ipv4 if website.ipv4 is not None else config.general.ipv4
|
||||
ipv6 = website.ipv6 if website.ipv6 is not None else config.general.ipv6
|
||||
if ipv4 is False and ipv6 is False:
|
||||
|
@ -186,16 +202,25 @@ async def update_from_config(db: Session, config: schemas.Config): # pylint: di
|
|||
existing_task.frequency = frequency
|
||||
if recheck_delay != existing_task.recheck_delay:
|
||||
existing_task.recheck_delay = recheck_delay # type: ignore[assignment]
|
||||
if (
|
||||
retry_before_notification
|
||||
!= existing_task.retry_before_notification
|
||||
):
|
||||
existing_task.retry_before_notification = (
|
||||
retry_before_notification
|
||||
)
|
||||
logger.debug(
|
||||
"Skipping db task creation for url=%s, "
|
||||
"method=%s, check_key=%s, expected=%s, "
|
||||
"frequency=%s, recheck_delay=%s, ip_version=%s.",
|
||||
"frequency=%s, recheck_delay=%s, "
|
||||
"retry_before_notification=%s, ip_version=%s.",
|
||||
url,
|
||||
p.method,
|
||||
check_key,
|
||||
expected,
|
||||
frequency,
|
||||
recheck_delay,
|
||||
retry_before_notification,
|
||||
ip_version,
|
||||
)
|
||||
|
||||
|
@ -212,6 +237,7 @@ async def update_from_config(db: Session, config: schemas.Config): # pylint: di
|
|||
expected=expected,
|
||||
frequency=frequency,
|
||||
recheck_delay=recheck_delay,
|
||||
retry_before_notification=retry_before_notification,
|
||||
already_retried=False,
|
||||
)
|
||||
logger.debug("Adding a new task in the db: %s", task)
|
||||
|
|
|
@ -7,7 +7,7 @@ from sqlalchemy.orm import Session
|
|||
from argos.logging import logger
|
||||
from argos.schemas import AgentResult, Config, Task
|
||||
from argos.server import queries
|
||||
from argos.server.alerting import handle_alert
|
||||
from argos.server.alerting import handle_alert, need_alert
|
||||
from argos.server.routes.dependencies import get_config, get_db, verify_token
|
||||
|
||||
route = APIRouter()
|
||||
|
@ -58,16 +58,26 @@ async def create_results( # pylint: disable-msg=too-many-positional-arguments
|
|||
logger.error("Unable to find task %i", agent_result.task_id)
|
||||
else:
|
||||
last_severity = task.severity
|
||||
last_severity_update = task.last_severity_update
|
||||
result = await queries.create_result(db, agent_result, agent_id)
|
||||
check = task.get_check()
|
||||
status, severity = await check.finalize(config, result, **result.context)
|
||||
result.set_status(status, severity)
|
||||
task.set_times_severity_and_deselect(severity, result.submitted_at)
|
||||
|
||||
# Don’t create an alert if the severity has not changed
|
||||
if last_severity != severity:
|
||||
send_notif = need_alert(
|
||||
last_severity, last_severity_update, severity, status, task
|
||||
)
|
||||
|
||||
if send_notif:
|
||||
background_tasks.add_task(
|
||||
handle_alert, config, result, task, severity, last_severity, request
|
||||
handle_alert,
|
||||
config,
|
||||
result,
|
||||
task,
|
||||
severity,
|
||||
last_severity,
|
||||
request,
|
||||
)
|
||||
|
||||
db_results.append(result)
|
||||
|
|
|
@ -39,6 +39,8 @@ def ssl_task(now):
|
|||
method="GET",
|
||||
task_group="GET-6-https://example.org",
|
||||
check="ssl-certificate-expiration",
|
||||
retry_before_notification=0,
|
||||
contiguous_failures=0,
|
||||
expected="on-check",
|
||||
selected_at=now,
|
||||
selected_by="pytest",
|
||||
|
|
Loading…
Reference in a new issue