feat: delete datalayer's files on delete

Until now, uMap was not deleting files on delete, which can increase
file storage a lot after some time.

The files are not deleted right away, but moved to a "purgatory" folder, from
where they can be purged after some time.
This commit is contained in:
Yohan Boniface 2024-09-21 09:08:36 +02:00
parent 86a8bbafa2
commit 49eb121c68
6 changed files with 111 additions and 2 deletions

View file

@ -282,6 +282,12 @@ How many total maps to return in the search.
How many maps to show in the user "my maps" page. How many maps to show in the user "my maps" page.
#### UMAP_PURGATORY_ROOT
Path where files are moved when a datalayer is deleted. They will stay there until
`umap purge_purgatory` is run. This may be useful in case a user deletes a datalayer,
or even a map, by mistake.
#### UMAP_SEARCH_CONFIGURATION #### UMAP_SEARCH_CONFIGURATION
Use it if you take control over the search configuration. Use it if you take control over the search configuration.

View file

@ -0,0 +1,28 @@
import time
from pathlib import Path
from django.conf import settings
from django.core.management.base import BaseCommand
class Command(BaseCommand):
    """Management command removing old files from the purgatory folder.

    The purgatory folder is read from ``settings.UMAP_PURGATORY_ROOT``. A file
    is removed when its modification time is older than ``--days`` days.
    """

    help = "Remove old files from purgatory. Eg.: umap purge_purgatory --days 7"

    def add_arguments(self, parser):
        """Register the ``--days`` option (integer, defaults to 30)."""
        parser.add_argument(
            "--days",
            help="Number of days to consider files for removal",
            default=30,
            type=int,
        )

    def handle(self, *args, **options):
        """Delete every regular file of the purgatory older than ``--days``."""
        days = options["days"]
        root = Path(settings.UMAP_PURGATORY_ROOT)
        if not root.is_dir():
            # The purgatory folder may not exist yet (e.g. nothing was ever
            # moved there): there is nothing to purge, and iterdir() would
            # raise FileNotFoundError.
            return
        threshold = time.time() - days * 86400
        for path in root.iterdir():
            if not path.is_file():
                # unlink() cannot remove directories; leave anything that is
                # not a regular file untouched instead of crashing mid-purge.
                continue
            if path.stat().st_mtime < threshold:
                path.unlink()
                # Use the command's own stream so output can be captured or
                # redirected by call_command(..., stdout=...).
                self.stdout.write(f"Removed old file {path}")

View file

@ -3,6 +3,7 @@ import operator
import os import os
import time import time
import uuid import uuid
from pathlib import Path
from django.conf import settings from django.conf import settings
from django.contrib.auth.models import User from django.contrib.auth.models import User
@ -255,6 +256,13 @@ class Map(NamedModel):
) )
return map_settings return map_settings
def delete(self, **kwargs):
    """Delete this map, after deleting each of its datalayers one by one.

    Keyword arguments are forwarded to the parent ``delete`` and its result
    is returned.
    """
    # Explicitly go through each datalayer's own delete(), so the datalayer
    # can deal with removing its files: the cascade delete would not call
    # the model delete method.
    layers = self.datalayer_set.all()
    for layer in layers:
        layer.delete()
    return super().delete(**kwargs)
def generate_umapjson(self, request): def generate_umapjson(self, request):
umapjson = self.settings umapjson = self.settings
umapjson["type"] = "umap" umapjson["type"] = "umap"
@ -462,7 +470,9 @@ class DataLayer(NamedModel):
def save(self, force_insert=False, force_update=False, **kwargs): def save(self, force_insert=False, force_update=False, **kwargs):
is_new = not bool(self.pk) is_new = not bool(self.pk)
super(DataLayer, self).save(force_insert, force_update, **kwargs) super(DataLayer, self).save(
force_insert=force_insert, force_update=force_update, **kwargs
)
if is_new: if is_new:
force_insert, force_update = False, True force_insert, force_update = False, True
@ -471,10 +481,25 @@ class DataLayer(NamedModel):
new_name = self.geojson.storage.save(filename, self.geojson) new_name = self.geojson.storage.save(filename, self.geojson)
self.geojson.storage.delete(old_name) self.geojson.storage.delete(old_name)
self.geojson.name = new_name self.geojson.name = new_name
super(DataLayer, self).save(force_insert, force_update, **kwargs) super(DataLayer, self).save(
force_insert=force_insert, force_update=force_update, **kwargs
)
self.purge_gzip() self.purge_gzip()
self.purge_old_versions() self.purge_old_versions()
def delete(self, **kwargs):
    """Clean up the datalayer's files, then delete the datalayer itself.

    Calls ``purge_gzip()`` and ``to_purgatory()`` (in that order) before
    delegating to the parent ``delete``, whose result is returned. Keyword
    arguments are forwarded to the parent ``delete``.
    """
    # File handling must happen first, while the instance is still intact.
    self.purge_gzip()
    self.to_purgatory()
    outcome = super().delete(**kwargs)
    return outcome
def to_purgatory(self):
    """Move every version file of this datalayer to the purgatory folder.

    The purgatory folder (``settings.UMAP_PURGATORY_ROOT``) is created when
    missing. Each file listed in ``self.versions`` is moved from the
    datalayer storage folder and renamed ``<map pk>_<original name>``.
    """
    # Function-scope import so this method does not depend on a module-level
    # import being present.
    import shutil

    dest = Path(settings.UMAP_PURGATORY_ROOT)
    dest.mkdir(parents=True, exist_ok=True)
    src = Path(self.geojson.storage.location) / self.storage_root()
    for version in self.versions:
        name = version["name"]
        # shutil.move falls back to copy + remove when source and destination
        # live on different filesystems, where Path.rename would raise
        # OSError (e.g. media on a data disk and the purgatory under /tmp).
        shutil.move(src / name, dest / f"{self.map.pk}_{name}")
def upload_to(self): def upload_to(self):
root = self.storage_root() root = self.storage_root()
name = "%s_%s.geojson" % (self.pk, int(time.time() * 1000)) name = "%s_%s.geojson" % (self.pk, int(time.time() * 1000))

View file

@ -267,6 +267,7 @@ UMAP_DEFAULT_FEATURES_HAVE_OWNERS = False
UMAP_HOME_FEED = "latest" UMAP_HOME_FEED = "latest"
UMAP_IMPORTERS = {} UMAP_IMPORTERS = {}
UMAP_HOST_INFOS = {} UMAP_HOST_INFOS = {}
UMAP_PURGATORY_ROOT = "/tmp/umappurgatory"
UMAP_READONLY = env("UMAP_READONLY", default=False) UMAP_READONLY = env("UMAP_READONLY", default=False)
UMAP_GZIP = True UMAP_GZIP = True

View file

@ -1,3 +1,4 @@
import tempfile
from pathlib import Path from pathlib import Path
import pytest import pytest
@ -269,3 +270,26 @@ def test_anonymous_can_edit_in_inherit_mode_and_map_in_public_mode(
map.save() map.save()
fake_request.user = AnonymousUser() fake_request.user = AnonymousUser()
assert datalayer.can_edit(fake_request) assert datalayer.can_edit(fake_request)
def test_should_remove_all_versions_on_delete(map, settings):
    """Deleting a datalayer must move all of its files out of the storage.

    Files prefixed with the datalayer pk or its ``old_id`` must leave the
    storage folder, while a file belonging to another id (and its gzip) must
    stay in place.
    """
    # Use a self-cleaning temporary folder so the test leaves nothing behind.
    with tempfile.TemporaryDirectory() as purgatory:
        settings.UMAP_PURGATORY_ROOT = purgatory
        datalayer = DataLayerFactory(
            uuid="0f1161c0-c07f-4ba4-86c5-8d8981d8a813", old_id=17
        )
        storage = datalayer.geojson.storage
        root = Path(datalayer.storage_root())
        # Files already present in the folder before adding the versions
        # (presumably the one created by the factory; verify if this changes).
        before = len(storage.listdir(root)[1])
        other = "123456_1440918637.geojson"
        files = [
            f"{datalayer.pk}_1440924889.geojson",
            f"{datalayer.pk}_1440923687.geojson",
            f"{datalayer.pk}_1440918637.geojson",
            f"{datalayer.old_id}_1440918537.geojson",
            other,
        ]
        for path in files:
            storage.save(root / path, ContentFile("{}"))
            storage.save(root / f"{path}.gz", ContentFile("{}"))
        assert len(storage.listdir(root)[1]) == 10 + before
        datalayer.delete()
        found = storage.listdir(root)[1]
        # Directory listing order is not guaranteed: compare sorted lists so
        # the test does not depend on the filesystem.
        assert sorted(found) == sorted([other, f"{other}.gz"])
        assert len(list(Path(purgatory).iterdir())) == 4 + before

View file

@ -0,0 +1,25 @@
import os
import tempfile
from pathlib import Path
from django.core.management import call_command
def test_purge_purgatory(settings):
    """``purge_purgatory`` must only remove files older than ``--days``.

    With the default threshold only the 31 days old file is removed; with
    ``--days=7`` the 8 days old file is removed too.
    """
    one_day = 86400

    # Use a self-cleaning temporary folder so the test leaves nothing behind.
    with tempfile.TemporaryDirectory() as purgatory:
        settings.UMAP_PURGATORY_ROOT = purgatory
        root = Path(purgatory)

        def make_file(name, age_in_days=0):
            """Create ``name`` in the purgatory, backdated by ``age_in_days``."""
            path = root / name
            path.write_text("{}")
            if age_in_days:
                stamp = path.stat().st_mtime - age_in_days * one_day
                os.utime(path, times=(stamp, stamp))

        make_file("old.json", age_in_days=31)
        make_file("recent.json", age_in_days=8)
        make_file("now.json")
        assert {f.name for f in root.iterdir()} == {
            "old.json",
            "recent.json",
            "now.json",
        }
        call_command("purge_purgatory")
        assert {f.name for f in root.iterdir()} == {"recent.json", "now.json"}
        call_command("purge_purgatory", "--days=7")
        assert {f.name for f in root.iterdir()} == {"now.json"}