mirror of
https://framagit.org/framasoft/framaspace/argos.git
synced 2025-04-28 18:02:41 +02:00
✨ — Ability to delay notification after X failures (fix #71)
This commit is contained in:
parent
e0edb50e12
commit
311d86d130
11 changed files with 151 additions and 6 deletions
|
@ -4,6 +4,7 @@
|
||||||
|
|
||||||
- ✨ — IPv4/IPv6 choice for checks, and choice for a dual-stack check (#69)
|
- ✨ — IPv4/IPv6 choice for checks, and choice for a dual-stack check (#69)
|
||||||
- ⚡ — Mutualize check requests (#68)
|
- ⚡ — Mutualize check requests (#68)
|
||||||
|
- ✨ — Ability to delay notification after X failures (#71)
|
||||||
|
|
||||||
## 0.6.1
|
## 0.6.1
|
||||||
|
|
||||||
|
|
|
@ -57,7 +57,7 @@ class ArgosAgent: # pylint: disable-msg=too-many-instance-attributes
|
||||||
self._http_client = httpx.AsyncClient(headers=auth_header)
|
self._http_client = httpx.AsyncClient(headers=auth_header)
|
||||||
|
|
||||||
ua_header = {
|
ua_header = {
|
||||||
"User-Agent": f"Argos Panoptes {VERSION} "
|
"User-Agent": f"Prout Argos Panoptes {VERSION} "
|
||||||
"(about: https://argos-monitoring.framasoft.org/)",
|
"(about: https://argos-monitoring.framasoft.org/)",
|
||||||
}
|
}
|
||||||
self._http_client_v4 = httpx.AsyncClient(
|
self._http_client_v4 = httpx.AsyncClient(
|
||||||
|
|
|
@ -64,6 +64,14 @@ general:
|
||||||
# For ex., to re-try a check one minute after a failure:
|
# For ex., to re-try a check one minute after a failure:
|
||||||
# recheck_delay: "1m"
|
# recheck_delay: "1m"
|
||||||
|
|
||||||
|
# Default setting for notifications delay.
|
||||||
|
# Say you want to be warned right after a failure on a check: set it to 0
|
||||||
|
# Say you want a second failure on the check before being warned,
|
||||||
|
# to avoid network hiccups: set it to 1
|
||||||
|
# Can be superseded in domain configuration
|
||||||
|
# If not present, default is 0
|
||||||
|
# retry_before_notification: 0
|
||||||
|
|
||||||
# Default settings for IPv4/IPv6
|
# Default settings for IPv4/IPv6
|
||||||
# Can be superseded in domain configuration.
|
# Can be superseded in domain configuration.
|
||||||
# By default, Argos will check both IPv4 and IPv6 addresses of a domain
|
# By default, Argos will check both IPv4 and IPv6 addresses of a domain
|
||||||
|
@ -143,6 +151,8 @@ ssl:
|
||||||
#
|
#
|
||||||
websites:
|
websites:
|
||||||
- domain: "https://mypads.example.org"
|
- domain: "https://mypads.example.org"
|
||||||
|
# Wait for a second failure before sending notification
|
||||||
|
retry_before_notification: 1
|
||||||
paths:
|
paths:
|
||||||
- path: "/mypads/"
|
- path: "/mypads/"
|
||||||
# Specify the method of the HTTP request
|
# Specify the method of the HTTP request
|
||||||
|
|
|
@ -121,6 +121,7 @@ class Website(BaseModel):
|
||||||
ipv6: bool | None = None
|
ipv6: bool | None = None
|
||||||
frequency: float | None = None
|
frequency: float | None = None
|
||||||
recheck_delay: float | None = None
|
recheck_delay: float | None = None
|
||||||
|
retry_before_notification: int | None = None
|
||||||
paths: List[WebsitePath]
|
paths: List[WebsitePath]
|
||||||
|
|
||||||
@field_validator("frequency", mode="before")
|
@field_validator("frequency", mode="before")
|
||||||
|
@ -206,6 +207,7 @@ class General(BaseModel):
|
||||||
ldap: LdapSettings | None = None
|
ldap: LdapSettings | None = None
|
||||||
frequency: float
|
frequency: float
|
||||||
recheck_delay: float | None = None
|
recheck_delay: float | None = None
|
||||||
|
retry_before_notification: int = 0
|
||||||
ipv4: bool = True
|
ipv4: bool = True
|
||||||
ipv6: bool = True
|
ipv6: bool = True
|
||||||
root_path: str = ""
|
root_path: str = ""
|
||||||
|
|
|
@ -24,6 +24,8 @@ class Task(BaseModel):
|
||||||
method: Method
|
method: Method
|
||||||
expected: str
|
expected: str
|
||||||
task_group: str
|
task_group: str
|
||||||
|
retry_before_notification: int
|
||||||
|
contiguous_failures: int
|
||||||
selected_at: datetime | None
|
selected_at: datetime | None
|
||||||
selected_by: str | None
|
selected_by: str | None
|
||||||
|
|
||||||
|
|
|
@ -11,6 +11,55 @@ import httpx
|
||||||
from argos.checks.base import Severity
|
from argos.checks.base import Severity
|
||||||
from argos.logging import logger
|
from argos.logging import logger
|
||||||
from argos.schemas.config import Config, Mail, GotifyUrl
|
from argos.schemas.config import Config, Mail, GotifyUrl
|
||||||
|
from argos.server.models import Task
|
||||||
|
|
||||||
|
|
||||||
|
def need_alert(
    last_severity: str, last_severity_update, severity: str, status: str, task: "Task"
) -> bool:
    """Decide whether the latest check result must trigger a notification.

    ``task.retry_before_notification`` is the number of *additional* failures
    to wait for before warning: with 0 a notification is sent as soon as the
    severity changes, with 1 it is sent on the second contiguous failure, etc.

    ``task.contiguous_failures`` is updated in place. Once a failure
    notification has been sent it is kept at or above the notification
    threshold, which both prevents re-sending on later failures and records
    that the following success must be announced.

    Args:
        last_severity: severity of the task before this result.
        last_severity_update: when the severity last changed; ``None`` is
            treated as "this is the first result for the task".
        severity: severity computed for this result.
        status: status computed for this result; anything other than
            ``"success"`` counts as a failure.
        task: object exposing ``retry_before_notification`` and
            ``contiguous_failures``.

    Returns:
        ``True`` if a notification should be sent, ``False`` otherwise.
    """
    retries = task.retry_before_notification
    severity_changed = last_severity != severity

    # No delay configured: notify on every severity change, never otherwise.
    if retries == 0:
        return severity_changed

    # Number of contiguous failures at which the notification goes out:
    # the first failure plus the configured number of retries.
    threshold = retries + 1

    # Seems to be a first check: always create a notification.
    if severity_changed and last_severity_update is None:
        if status != "success":
            # Mark the failure as already notified so that it is not
            # re-sent on a future failure.
            task.contiguous_failures = threshold
        return True

    if status == "success":
        # Announce the recovery only if the failure itself was announced.
        failure_was_notified = task.contiguous_failures >= threshold
        # Any success breaks the streak, so the count stays "contiguous".
        task.contiguous_failures = 0
        return failure_was_notified

    # This is a new failure.
    task.contiguous_failures += 1

    # Severity has changed from one failing level to another: notify
    # immediately and mark the failure as already notified.
    if last_severity not in ("ok", severity) and last_severity_update is not None:
        task.contiguous_failures = threshold
        return True

    # Severity has not changed: notify exactly once, when the threshold is hit.
    return task.contiguous_failures == threshold
|
||||||
|
|
||||||
|
|
||||||
def get_icon_from_severity(severity: str) -> str:
|
def get_icon_from_severity(severity: str) -> str:
|
||||||
|
|
|
@ -0,0 +1,41 @@
|
||||||
|
"""Add retries before notification feature
|
||||||
|
|
||||||
|
Revision ID: 80a29f64f91c
|
||||||
|
Revises: 8b58ced14d6e
|
||||||
|
Create Date: 2024-12-04 17:03:35.104368
|
||||||
|
|
||||||
|
"""
|
||||||
|
from typing import Sequence, Union
|
||||||
|
|
||||||
|
from alembic import op
|
||||||
|
import sqlalchemy as sa
|
||||||
|
|
||||||
|
|
||||||
|
# revision identifiers, used by Alembic.
|
||||||
|
revision: str = "80a29f64f91c"
|
||||||
|
down_revision: Union[str, None] = "8b58ced14d6e"
|
||||||
|
branch_labels: Union[str, Sequence[str], None] = None
|
||||||
|
depends_on: Union[str, Sequence[str], None] = None
|
||||||
|
|
||||||
|
|
||||||
|
def upgrade() -> None:
    """Add the delayed-notification counters to the ``tasks`` table.

    Both columns are non-nullable integers with a server-side default of 0.
    """
    counter_columns = ("retry_before_notification", "contiguous_failures")
    with op.batch_alter_table("tasks", schema=None) as batch_op:
        for column_name in counter_columns:
            new_column = sa.Column(
                column_name,
                sa.Integer(),
                server_default="0",
                nullable=False,
            )
            batch_op.add_column(new_column)
|
||||||
|
|
||||||
|
|
||||||
|
def downgrade() -> None:
    """Drop the delayed-notification counters from the ``tasks`` table."""
    # Removed in the reverse order of their creation in upgrade().
    counter_columns = ("contiguous_failures", "retry_before_notification")
    with op.batch_alter_table("tasks", schema=None) as batch_op:
        for column_name in counter_columns:
            batch_op.drop_column(column_name)
|
|
@ -50,6 +50,8 @@ class Task(Base):
|
||||||
frequency: Mapped[float] = mapped_column()
|
frequency: Mapped[float] = mapped_column()
|
||||||
recheck_delay: Mapped[float] = mapped_column(nullable=True)
|
recheck_delay: Mapped[float] = mapped_column(nullable=True)
|
||||||
already_retried: Mapped[bool] = mapped_column(insert_default=False)
|
already_retried: Mapped[bool] = mapped_column(insert_default=False)
|
||||||
|
retry_before_notification: Mapped[int] = mapped_column(insert_default=0)
|
||||||
|
contiguous_failures: Mapped[int] = mapped_column(insert_default=0)
|
||||||
method: Mapped[Method] = mapped_column(
|
method: Mapped[Method] = mapped_column(
|
||||||
Enum(
|
Enum(
|
||||||
"GET",
|
"GET",
|
||||||
|
|
|
@ -106,6 +106,11 @@ async def has_config_changed(db: Session, config: schemas.Config) -> bool:
|
||||||
same_config = False
|
same_config = False
|
||||||
conf.val = str(config.general.recheck_delay)
|
conf.val = str(config.general.recheck_delay)
|
||||||
conf.updated_at = datetime.now()
|
conf.updated_at = datetime.now()
|
||||||
|
case "general_retry_before_notification":
|
||||||
|
if conf.val != str(config.general.retry_before_notification):
|
||||||
|
same_config = False
|
||||||
|
conf.val = str(config.general.retry_before_notification)
|
||||||
|
conf.updated_at = datetime.now()
|
||||||
|
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
|
@ -126,9 +131,15 @@ async def has_config_changed(db: Session, config: schemas.Config) -> bool:
|
||||||
val=str(config.general.recheck_delay),
|
val=str(config.general.recheck_delay),
|
||||||
updated_at=datetime.now(),
|
updated_at=datetime.now(),
|
||||||
)
|
)
|
||||||
|
gen_retry_before_notif = ConfigCache(
|
||||||
|
name="general_retry_before_notification",
|
||||||
|
val=str(config.general.retry_before_notification),
|
||||||
|
updated_at=datetime.now(),
|
||||||
|
)
|
||||||
db.add(web_hash)
|
db.add(web_hash)
|
||||||
db.add(gen_freq)
|
db.add(gen_freq)
|
||||||
db.add(gen_recheck)
|
db.add(gen_recheck)
|
||||||
|
db.add(gen_retry_before_notif)
|
||||||
db.commit()
|
db.commit()
|
||||||
|
|
||||||
return True
|
return True
|
||||||
|
@ -150,6 +161,11 @@ async def update_from_config(db: Session, config: schemas.Config): # pylint: di
|
||||||
domain = str(website.domain)
|
domain = str(website.domain)
|
||||||
frequency = website.frequency or config.general.frequency
|
frequency = website.frequency or config.general.frequency
|
||||||
recheck_delay = website.recheck_delay or config.general.recheck_delay
|
recheck_delay = website.recheck_delay or config.general.recheck_delay
|
||||||
|
retry_before_notification = (
|
||||||
|
website.retry_before_notification
|
||||||
|
if website.retry_before_notification is not None
|
||||||
|
else config.general.retry_before_notification
|
||||||
|
)
|
||||||
ipv4 = website.ipv4 if website.ipv4 is not None else config.general.ipv4
|
ipv4 = website.ipv4 if website.ipv4 is not None else config.general.ipv4
|
||||||
ipv6 = website.ipv6 if website.ipv6 is not None else config.general.ipv6
|
ipv6 = website.ipv6 if website.ipv6 is not None else config.general.ipv6
|
||||||
if ipv4 is False and ipv6 is False:
|
if ipv4 is False and ipv6 is False:
|
||||||
|
@ -186,16 +202,25 @@ async def update_from_config(db: Session, config: schemas.Config): # pylint: di
|
||||||
existing_task.frequency = frequency
|
existing_task.frequency = frequency
|
||||||
if recheck_delay != existing_task.recheck_delay:
|
if recheck_delay != existing_task.recheck_delay:
|
||||||
existing_task.recheck_delay = recheck_delay # type: ignore[assignment]
|
existing_task.recheck_delay = recheck_delay # type: ignore[assignment]
|
||||||
|
if (
|
||||||
|
retry_before_notification
|
||||||
|
!= existing_task.retry_before_notification
|
||||||
|
):
|
||||||
|
existing_task.retry_before_notification = (
|
||||||
|
retry_before_notification
|
||||||
|
)
|
||||||
logger.debug(
|
logger.debug(
|
||||||
"Skipping db task creation for url=%s, "
|
"Skipping db task creation for url=%s, "
|
||||||
"method=%s, check_key=%s, expected=%s, "
|
"method=%s, check_key=%s, expected=%s, "
|
||||||
"frequency=%s, recheck_delay=%s, ip_version=%s.",
|
"frequency=%s, recheck_delay=%s, "
|
||||||
|
"retry_before_notification=%s, ip_version=%s.",
|
||||||
url,
|
url,
|
||||||
p.method,
|
p.method,
|
||||||
check_key,
|
check_key,
|
||||||
expected,
|
expected,
|
||||||
frequency,
|
frequency,
|
||||||
recheck_delay,
|
recheck_delay,
|
||||||
|
retry_before_notification,
|
||||||
ip_version,
|
ip_version,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
@ -212,6 +237,7 @@ async def update_from_config(db: Session, config: schemas.Config): # pylint: di
|
||||||
expected=expected,
|
expected=expected,
|
||||||
frequency=frequency,
|
frequency=frequency,
|
||||||
recheck_delay=recheck_delay,
|
recheck_delay=recheck_delay,
|
||||||
|
retry_before_notification=retry_before_notification,
|
||||||
already_retried=False,
|
already_retried=False,
|
||||||
)
|
)
|
||||||
logger.debug("Adding a new task in the db: %s", task)
|
logger.debug("Adding a new task in the db: %s", task)
|
||||||
|
|
|
@ -7,7 +7,7 @@ from sqlalchemy.orm import Session
|
||||||
from argos.logging import logger
|
from argos.logging import logger
|
||||||
from argos.schemas import AgentResult, Config, Task
|
from argos.schemas import AgentResult, Config, Task
|
||||||
from argos.server import queries
|
from argos.server import queries
|
||||||
from argos.server.alerting import handle_alert
|
from argos.server.alerting import handle_alert, need_alert
|
||||||
from argos.server.routes.dependencies import get_config, get_db, verify_token
|
from argos.server.routes.dependencies import get_config, get_db, verify_token
|
||||||
|
|
||||||
route = APIRouter()
|
route = APIRouter()
|
||||||
|
@ -58,16 +58,26 @@ async def create_results( # pylint: disable-msg=too-many-positional-arguments
|
||||||
logger.error("Unable to find task %i", agent_result.task_id)
|
logger.error("Unable to find task %i", agent_result.task_id)
|
||||||
else:
|
else:
|
||||||
last_severity = task.severity
|
last_severity = task.severity
|
||||||
|
last_severity_update = task.last_severity_update
|
||||||
result = await queries.create_result(db, agent_result, agent_id)
|
result = await queries.create_result(db, agent_result, agent_id)
|
||||||
check = task.get_check()
|
check = task.get_check()
|
||||||
status, severity = await check.finalize(config, result, **result.context)
|
status, severity = await check.finalize(config, result, **result.context)
|
||||||
result.set_status(status, severity)
|
result.set_status(status, severity)
|
||||||
task.set_times_severity_and_deselect(severity, result.submitted_at)
|
task.set_times_severity_and_deselect(severity, result.submitted_at)
|
||||||
|
|
||||||
# Don’t create an alert if the severity has not changed
|
send_notif = need_alert(
|
||||||
if last_severity != severity:
|
last_severity, last_severity_update, severity, status, task
|
||||||
|
)
|
||||||
|
|
||||||
|
if send_notif:
|
||||||
background_tasks.add_task(
|
background_tasks.add_task(
|
||||||
handle_alert, config, result, task, severity, last_severity, request
|
handle_alert,
|
||||||
|
config,
|
||||||
|
result,
|
||||||
|
task,
|
||||||
|
severity,
|
||||||
|
last_severity,
|
||||||
|
request,
|
||||||
)
|
)
|
||||||
|
|
||||||
db_results.append(result)
|
db_results.append(result)
|
||||||
|
|
|
@ -39,6 +39,8 @@ def ssl_task(now):
|
||||||
method="GET",
|
method="GET",
|
||||||
task_group="GET-6-https://example.org",
|
task_group="GET-6-https://example.org",
|
||||||
check="ssl-certificate-expiration",
|
check="ssl-certificate-expiration",
|
||||||
|
retry_before_notification=0,
|
||||||
|
contiguous_failures=0,
|
||||||
expected="on-check",
|
expected="on-check",
|
||||||
selected_at=now,
|
selected_at=now,
|
||||||
selected_by="pytest",
|
selected_by="pytest",
|
||||||
|
|
Loading…
Reference in a new issue