— No need cron tasks for DB cleaning anymore (fix #74 and #75)

This commit is contained in:
Luc Didry 2025-02-17 10:46:01 +01:00
parent 73e7a8f414
commit c98cd9c017
No known key found for this signature in database
GPG key ID: EA868E12D0257E3C
12 changed files with 109 additions and 124 deletions

View file

@ -4,6 +4,7 @@
- ✨ — Allow to customize agent User-Agent header (#78) - ✨ — Allow to customize agent User-Agent header (#78)
- 📝 — Document how to add data to requests (#77) - 📝 — Document how to add data to requests (#77)
- ✨ — No need for cron tasks for DB cleaning anymore (#74 and #75)
## 0.7.4 ## 0.7.4

View file

@ -140,73 +140,19 @@ def start(host, port, config, reload):
uvicorn.run("argos.server:app", host=host, port=port, reload=reload) uvicorn.run("argos.server:app", host=host, port=port, reload=reload)
def validate_max_lock_seconds(ctx, param, value): def validate_time_without_agent(ctx, param, value):
if value <= 60:
raise click.BadParameter("Should be strictly higher than 60")
return value
def validate_max_results(ctx, param, value):
if value <= 0: if value <= 0:
raise click.BadParameter("Should be a positive integer") raise click.BadParameter("Should be a positive integer")
return value return value
@server.command()
@click.option(
    "--max-results",
    default=100,
    help="Number of results per task to keep",
    callback=validate_max_results,
)
@click.option(
    "--max-lock-seconds",
    default=100,
    help=(
        "The number of seconds after which a lock is "
        "considered stale, must be higher than 60 "
        "(the checks have a timeout value of 60 seconds)"
    ),
    callback=validate_max_lock_seconds,
)
@click.option(
    "--config",
    default="argos-config.yaml",
    help="Path of the configuration file. "
    "If ARGOS_YAML_FILE environment variable is set, its value will be used instead. "
    "Default value: argos-config.yaml and /etc/argos/config.yaml as fallback.",
    envvar="ARGOS_YAML_FILE",
    callback=validate_config_access,
)
@coroutine
async def cleandb(max_results, max_lock_seconds, config):
    """Clean the database (to run routinely)

    \b
    - Removes old results from the database.
    - Removes locks from tasks that have been locked for too long.
    """
    # NOTE: the docstring above is the text shown by `--help`, keep it as is.

    # The configuration path has to be exported before the server modules
    # are imported below.
    os.environ["ARGOS_YAML_FILE"] = config

    # Imported here and not at module level, otherwise the agent would need
    # the server configuration files too.
    from argos.server import queries

    session = await get_db()
    removed_count = await queries.remove_old_results(session, max_results)
    released_count = await queries.release_old_locks(session, max_lock_seconds)

    click.echo(f"{removed_count} results removed")
    click.echo(f"{released_count} locks released")
@server.command() @server.command()
@click.option( @click.option(
"--time-without-agent", "--time-without-agent",
default=5, default=5,
help="Time without seeing an agent after which a warning will be issued, in minutes. " help="Time without seeing an agent after which a warning will be issued, in minutes. "
"Default is 5 minutes.", "Default is 5 minutes.",
callback=validate_max_results, callback=validate_time_without_agent,
) )
@click.option( @click.option(
"--config", "--config",

View file

@ -81,6 +81,12 @@ general:
# To disable the IPv6 check of domains: # To disable the IPv6 check of domains:
# ipv6: false # ipv6: false
# Argos root path
# If not present, default value is ""
# Set it to /foo if you want to use argos at /foo/ instead of /
# on your web server
# root_path: "/foo"
# Which way do you want to be warned when a check goes to that severity? # Which way do you want to be warned when a check goes to that severity?
# "local" emits a message in the server log # "local" emits a message in the server log
# You'll need to configure mail, gotify or apprise below to be able to use # You'll need to configure mail, gotify or apprise below to be able to use
@ -96,11 +102,6 @@ general:
- local - local
unknown: unknown:
- local - local
# Argos root path
# If not present, default value is ""
# Set it to /foo if you want to use argos at /foo/ instead of /
# on your web server
# root_path: "/foo"
# Mail configuration is quite straight-forward # Mail configuration is quite straight-forward
# mail: # mail:
# mailfrom: no-reply@example.org # mailfrom: no-reply@example.org
@ -144,6 +145,16 @@ ssl:
- "1d": critical - "1d": critical
- "5d": warning - "5d": warning
# Argos will do some cleaning in the background for you
# every 2 minutes and needs some configuration for that
cleaning:
# Max number of results per task you want to keep
# Minimum value is 1, default is 100
max_results: 100
# Max number of seconds a task can be locked
# Minimum value is 61, default is 100
max_lock_seconds: 100
# It's also possible to define the checks in another file # It's also possible to define the checks in another file
# with the include syntax: # with the include syntax:
# #

View file

@ -14,9 +14,10 @@ logger = logging.getLogger(__name__)
# XXX Does not work ? # XXX Does not work ?
def set_log_level(log_level): def set_log_level(log_level: str, quiet: bool = False):
level = getattr(logging, log_level.upper(), None) level = getattr(logging, log_level.upper(), None)
if not isinstance(level, int): if not isinstance(level, int):
raise ValueError(f"Invalid log level: {log_level}") raise ValueError(f"Invalid log level: {log_level}")
logger.setLevel(level=level) logger.setLevel(level=level)
if not quiet:
logger.info("Log level set to %s", log_level) logger.info("Log level set to %s", log_level)

View file

@ -48,6 +48,27 @@ class SSL(BaseModel):
thresholds: List[Annotated[Tuple[int, Severity], BeforeValidator(parse_threshold)]] thresholds: List[Annotated[Tuple[int, Severity], BeforeValidator(parse_threshold)]]
class Cleaning(BaseModel):
    """Settings of the background database cleaning.

    Both settings are optional and default to 100.
    A value below the allowed minimum is not rejected: it is silently
    replaced by the default value (100).
    """

    # Max number of results kept per task (minimum 1)
    max_results: int = 100
    # Max number of seconds a task can stay locked (minimum 61)
    max_lock_seconds: int = 100

    # The validators run after pydantic's own int validation, so the
    # comparisons below are always made on an int. In "before" mode, a quoted
    # or empty YAML value would raise an uncaught TypeError on comparison.
    @field_validator("max_results")
    @classmethod
    def parse_max_results(cls, value: int) -> int:
        """Ensure that max_results is higher than 0, fall back to 100 otherwise"""
        if value >= 1:
            return value
        return 100

    @field_validator("max_lock_seconds")
    @classmethod
    def parse_max_lock_seconds(cls, value: int) -> int:
        """Ensure that max_lock_seconds is strictly higher than the checks
        timeout (60 seconds), fall back to 100 otherwise"""
        if value > 60:
            return value
        return 100
class WebsiteCheck(BaseModel): class WebsiteCheck(BaseModel):
key: str key: str
value: str | List[str] | Dict[str, str] value: str | List[str] | Dict[str, str]
@ -264,4 +285,5 @@ class Config(BaseModel):
general: General general: General
service: Service service: Service
ssl: SSL ssl: SSL
cleaning: Cleaning
websites: List[Website] websites: List[Website]

View file

@ -6,11 +6,12 @@ from pathlib import Path
from fastapi import FastAPI from fastapi import FastAPI
from fastapi.staticfiles import StaticFiles from fastapi.staticfiles import StaticFiles
from fastapi_login import LoginManager from fastapi_login import LoginManager
from fastapi_utils.tasks import repeat_every
from pydantic import ValidationError from pydantic import ValidationError
from sqlalchemy import create_engine, event from sqlalchemy import create_engine, event
from sqlalchemy.orm import sessionmaker from sqlalchemy.orm import sessionmaker
from argos.logging import logger from argos.logging import logger, set_log_level
from argos.server import models, routes, queries from argos.server import models, routes, queries
from argos.server.exceptions import NotAuthenticatedException, auth_exception_handler from argos.server.exceptions import NotAuthenticatedException, auth_exception_handler
from argos.server.settings import read_yaml_config from argos.server.settings import read_yaml_config
@ -126,8 +127,24 @@ def create_manager(cookie_secret: str) -> LoginManager:
) )
@repeat_every(seconds=120, logger=logger)
async def cleanup() -> None:
    """Run the DB cleaning tasks.

    Scheduled through `repeat_every` with a 120 seconds interval.
    Removes the old results and releases the old locks, using the limits
    found in the `cleaning` section of the configuration, then logs how
    many results were removed and how many locks were released.
    """
    # Set the log level to "info" (quietly) before logging the messages below
    set_log_level("info", quiet=True)
    logger.info("Start DB cleanup tasks.")

    with app.state.SessionLocal() as session:
        max_results = app.state.config.cleaning.max_results
        removed_count = await queries.remove_old_results(session, max_results)

        max_lock_seconds = app.state.config.cleaning.max_lock_seconds
        released_count = await queries.release_old_locks(session, max_lock_seconds)

    logger.info("%i results removed", removed_count)
    logger.info("%i locks released", released_count)
@asynccontextmanager @asynccontextmanager
async def lifespan(appli): async def lifespan(appli: FastAPI):
"""Server start and stop actions """Server start and stop actions
Setup database connection then close it at shutdown. Setup database connection then close it at shutdown.
@ -142,6 +159,7 @@ async def lifespan(appli):
"There is no tasks in the database. " "There is no tasks in the database. "
'Please launch the command "argos server reload-config"' 'Please launch the command "argos server reload-config"'
) )
await cleanup()
yield yield

View file

@ -84,7 +84,6 @@ Options:
--help Show this message and exit. --help Show this message and exit.
Commands: Commands:
cleandb Clean the database (to run routinely)
generate-config Output a self-documented example config file. generate-config Output a self-documented example config file.
generate-token Generate a token for agents generate-token Generate a token for agents
migrate Run database migrations migrate Run database migrations
@ -152,59 +151,27 @@ Options:
--> -->
### Server cleandb
<!--
.. [[[cog
help(["server", "cleandb", "--help"])
.. ]]] -->
```man
Usage: argos server cleandb [OPTIONS]
Clean the database (to run routinely)
- Removes old results from the database.
- Removes locks from tasks that have been locked for too long.
Options:
--max-results INTEGER Number of results per task to keep
--max-lock-seconds INTEGER The number of seconds after which a lock is
considered stale, must be higher than 60 (the
checks have a timeout value of 60 seconds)
--config TEXT Path of the configuration file. If ARGOS_YAML_FILE
environment variable is set, its value will be
used instead. Default value: argos-config.yaml and
/etc/argos/config.yaml as fallback.
--help Show this message and exit.
```
<!--[[[end]]]
-->
### Server watch-agents ### Server watch-agents
<!-- <!--
.. [[[cog .. [[[cog
help(["server", "cleandb", "--help"]) help(["server", "watch-agents", "--help"])
.. ]]] --> .. ]]] -->
```man ```man
Usage: argos server cleandb [OPTIONS] Usage: argos server watch-agents [OPTIONS]
Clean the database (to run routinely) Watch agents (to run routinely)
- Removes old results from the database. Issues a warning if no agent has been seen by the server for a given time.
- Removes locks from tasks that have been locked for too long.
Options: Options:
--max-results INTEGER Number of results per task to keep --time-without-agent INTEGER Time without seeing an agent after which a
--max-lock-seconds INTEGER The number of seconds after which a lock is warning will be issued, in minutes. Default is 5
considered stale, must be higher than 60 (the minutes.
checks have a timeout value of 60 seconds) --config TEXT Path of the configuration file. If
--config TEXT Path of the configuration file. If ARGOS_YAML_FILE ARGOS_YAML_FILE environment variable is set, its
environment variable is set, its value will be value will be used instead.
used instead. Default value: argos-config.yaml and
/etc/argos/config.yaml as fallback.
--help Show this message and exit. --help Show this message and exit.
``` ```

View file

@ -191,18 +191,6 @@ The only requirement is that the agent can reach the server through HTTP or HTTP
argos agent http://localhost:8000 "auth-token" argos agent http://localhost:8000 "auth-token"
``` ```
## Cleaning the database
You have to run the cleaning task periodically. `argos server cleandb --help` will give you more information on how to do that.
Here is a crontab example, which will clean the db each hour:
```bash
# Run the cleaning tasks every hour (at minute 7)
# Keeps 10 results per task, and remove tasks locks older than 1 hour
7 * * * * argos server cleandb --max-results 10 --max-lock-seconds 3600
```
## Watch the agents ## Watch the agents
In order to be sure that agents are up and communicate with the server, you can periodically run the `argos server watch-agents` command. In order to be sure that agents are up and communicate with the server, you can periodically run the `argos server watch-agents` command.

View file

@ -153,8 +153,7 @@ If all works well, you have to put some cron tasks in `argos` crontab:
```bash ```bash
cat <<EOF | crontab -u argos - cat <<EOF | crontab -u argos -
*/10 * * * * /opt/argos/venv/bin/argos server cleandb --max-lock-seconds 120 --max-results 1200 */10 * * * * /opt/argos/venv/bin/argos server watch-agents --time-without-agent 10
*/10 * * * * /opt/argos/venv/bin/argos server watch-agents --time-without-agent 10
EOF EOF
``` ```

View file

@ -22,6 +22,7 @@ classifiers = [
dependencies = [ dependencies = [
"alembic>=1.13.0,<1.14", "alembic>=1.13.0,<1.14",
"fastapi-utils>=0.8.0,<0.9",
"apprise>=1.9.0,<2", "apprise>=1.9.0,<2",
"bcrypt>=4.1.3,<5", "bcrypt>=4.1.3,<5",
"click>=8.1,<9", "click>=8.1,<9",

View file

@ -1,11 +1,21 @@
--- ---
general: general:
# Except for frequency and recheck_delay settings, changes in general
# section of the configuration will need a restart of argos server.
db: db:
# The database URL, as defined in SQLAlchemy docs: # The database URL, as defined in SQLAlchemy docs:
# https://docs.sqlalchemy.org/en/20/core/engines.html#database-urls # https://docs.sqlalchemy.org/en/20/core/engines.html#database-urls
url: "sqlite:////tmp/test-argos.db" url: "sqlite:////tmp/test-argos.db"
# Can be "production", "dev", "test".
# If not present, default value is "production"
env: test env: test
# To get a good string for cookie_secret, run:
# openssl rand -hex 32
cookie_secret: "foo-bar-baz" cookie_secret: "foo-bar-baz"
# Default delay for checks.
# Can be superseded in domain configuration.
# For ex., to run checks every minute:
frequency: "1m" frequency: "1m"
alerts: alerts:
ok: ok:
@ -18,10 +28,27 @@ general:
- local - local
service: service:
secrets: secrets:
# Secrets can be generated using `argos server generate-token`.
# You need at least one. Write them as a list, like:
# - secret_token
- "O4kt8Max9/k0EmHaEJ0CGGYbBNFmK8kOZNIoUk3Kjwc" - "O4kt8Max9/k0EmHaEJ0CGGYbBNFmK8kOZNIoUk3Kjwc"
- "x1T1VZR51pxrv5pQUyzooMG4pMUvHNMhA5y/3cUsYVs=" - "x1T1VZR51pxrv5pQUyzooMG4pMUvHNMhA5y/3cUsYVs="
ssl: ssl:
thresholds: thresholds:
- "1d": critical - "1d": critical
"5d": warning - "5d": warning
# Argos will do some cleaning in the background for you
# every 2 minutes and needs some configuration for that
cleaning:
# Max number of results per task you want to keep
# Minimum value is 1, default is 100
max_results: 100
# Max number of seconds a task can be locked
# Minimum value is 61, default is 100
max_lock_seconds: 100
# It's also possible to define the checks in another file
# with the include syntax:
#
websites: !include websites.yaml websites: !include websites.yaml

View file

@ -243,6 +243,10 @@ def empty_config():
] ]
), ),
ssl=schemas.config.SSL(thresholds=[]), ssl=schemas.config.SSL(thresholds=[]),
cleaning=schemas.config.Cleaning(
max_results=100,
max_lock_seconds=120,
),
websites=[], websites=[],
) )