feat: delete datalayer's files on delete (#2158)

Until now, uMap was not deleting files on delete, which can increase
file storage a lot after some time.

The files are not deleted, but moved to a "purgatory" folder, from where
they can be deleted after some time.
This commit is contained in:
Yohan Boniface 2024-10-04 16:41:11 +02:00 committed by GitHub
commit 5d5be52337
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 111 additions and 2 deletions

View file

@ -282,6 +282,12 @@ How many total maps to return in the search.
How many maps to show in the user "my maps" page.
#### UMAP_PURGATORY_ROOT
Path where files are moved when a datalayer is deleted. They will stay there until
`umap purge_purgatory` is run. This may be useful in case a user deletes a datalayer,
or even a map, by mistake.
#### UMAP_SEARCH_CONFIGURATION
Use it if you take control over the search configuration.

View file

@ -0,0 +1,28 @@
import time
from pathlib import Path
from django.conf import settings
from django.core.management.base import BaseCommand
class Command(BaseCommand):
    """Management command removing files that stayed too long in the purgatory."""

    help = "Remove old files from purgatory. Eg.: umap purge_purgatory --days 7"

    def add_arguments(self, parser):
        """Register the ``--days`` option (integer, defaults to 30)."""
        parser.add_argument(
            "--days",
            help="Number of days to consider files for removal",
            default=30,
            type=int,
        )

    def handle(self, *args, **options):
        """Delete purgatory files last modified more than ``--days`` days ago.

        Only regular files directly inside ``settings.UMAP_PURGATORY_ROOT``
        are considered. Each removal is reported on the command's stdout.
        """
        days = options["days"]
        root = Path(settings.UMAP_PURGATORY_ROOT)
        if not root.is_dir():
            # The folder may not exist yet (e.g. nothing has been moved there
            # so far): there is simply nothing to purge.
            return
        threshold = time.time() - days * 86400
        for path in root.iterdir():
            if not path.is_file():
                # unlink() cannot remove directories; leave them untouched.
                continue
            if path.stat().st_mtime < threshold:
                path.unlink()
                self.stdout.write(f"Removed old file {path}")

View file

@ -3,6 +3,7 @@ import operator
import os
import time
import uuid
from pathlib import Path
from django.conf import settings
from django.contrib.auth.models import User
@ -255,6 +256,13 @@ class Map(NamedModel):
)
return map_settings
def delete(self, **kwargs):
    """Delete the map, after deleting each of its datalayers one by one.

    Keyword arguments are forwarded to the parent ``delete``, whose result
    is returned.
    """
    # Explicitly go through each datalayer's own delete(), so file removal
    # can be dealt with (a cascade delete would not call the model delete
    # method).
    layers = self.datalayer_set.all()
    for layer in layers:
        layer.delete()
    return super().delete(**kwargs)
def generate_umapjson(self, request):
umapjson = self.settings
umapjson["type"] = "umap"
@ -462,7 +470,9 @@ class DataLayer(NamedModel):
def save(self, force_insert=False, force_update=False, **kwargs):
is_new = not bool(self.pk)
super(DataLayer, self).save(force_insert, force_update, **kwargs)
super(DataLayer, self).save(
force_insert=force_insert, force_update=force_update, **kwargs
)
if is_new:
force_insert, force_update = False, True
@ -471,10 +481,25 @@ class DataLayer(NamedModel):
new_name = self.geojson.storage.save(filename, self.geojson)
self.geojson.storage.delete(old_name)
self.geojson.name = new_name
super(DataLayer, self).save(force_insert, force_update, **kwargs)
super(DataLayer, self).save(
force_insert=force_insert, force_update=force_update, **kwargs
)
self.purge_gzip()
self.purge_old_versions()
def delete(self, **kwargs):
    """Delete the datalayer once its files have been dealt with.

    ``purge_gzip()`` then ``to_purgatory()`` are run first; the keyword
    arguments are forwarded to the parent ``delete``, whose result is
    returned.
    """
    # File handling happens before the parent delete is called.
    self.purge_gzip()
    self.to_purgatory()
    outcome = super().delete(**kwargs)
    return outcome
def to_purgatory(self):
    """Move every version file of this datalayer to the purgatory folder.

    The folder (``settings.UMAP_PURGATORY_ROOT``) is created when missing.
    Each file listed in ``self.versions`` is moved there and renamed to
    ``<map pk>_<original name>``.
    """
    # Local import: only this method needs it.
    import shutil

    dest = Path(settings.UMAP_PURGATORY_ROOT)
    dest.mkdir(parents=True, exist_ok=True)
    src = Path(self.geojson.storage.location) / self.storage_root()
    for version in self.versions:
        name = version["name"]
        # shutil.move falls back to copy + delete when the purgatory lives on
        # another filesystem, where a plain rename fails with
        # "Invalid cross-device link".
        shutil.move(src / name, dest / f"{self.map.pk}_{name}")
def upload_to(self):
root = self.storage_root()
name = "%s_%s.geojson" % (self.pk, int(time.time() * 1000))

View file

@ -267,6 +267,7 @@ UMAP_DEFAULT_FEATURES_HAVE_OWNERS = False
UMAP_HOME_FEED = "latest"
UMAP_IMPORTERS = {}
UMAP_HOST_INFOS = {}
# Folder where the files of a deleted datalayer are moved; they stay there
# until `umap purge_purgatory` is run.
UMAP_PURGATORY_ROOT = "/tmp/umappurgatory"
UMAP_READONLY = env("UMAP_READONLY", default=False)
UMAP_GZIP = True

View file

@ -1,3 +1,4 @@
import tempfile
from pathlib import Path
import pytest
@ -269,3 +270,26 @@ def test_anonymous_can_edit_in_inherit_mode_and_map_in_public_mode(
map.save()
fake_request.user = AnonymousUser()
assert datalayer.can_edit(fake_request)
def test_should_remove_all_versions_on_delete(map, settings):
    """Deleting a datalayer leaves only unrelated files in its storage folder.

    The files named after the datalayer ``pk`` or ``old_id`` (and their
    ``.gz`` twins) must be gone from storage, and the purgatory must hold the
    expected number of files.
    """
    settings.UMAP_PURGATORY_ROOT = tempfile.mkdtemp()
    datalayer = DataLayerFactory(uuid="0f1161c0-c07f-4ba4-86c5-8d8981d8a813", old_id=17)
    storage = datalayer.geojson.storage
    root = Path(datalayer.storage_root())
    before = len(storage.listdir(root)[1])
    other = "123456_1440918637.geojson"
    owned = [
        f"{datalayer.pk}_1440924889.geojson",
        f"{datalayer.pk}_1440923687.geojson",
        f"{datalayer.pk}_1440918637.geojson",
        f"{datalayer.old_id}_1440918537.geojson",
    ]
    # Each file is stored along with a gzip twin: 5 names -> 10 files.
    for filename in [*owned, other]:
        storage.save(root / filename, ContentFile("{}"))
        storage.save(root / f"{filename}.gz", ContentFile("{}"))
    assert len(storage.listdir(root)[1]) == 10 + before

    datalayer.delete()

    remaining = storage.listdir(root)[1]
    assert remaining == [other, f"{other}.gz"]
    purgatory = Path(settings.UMAP_PURGATORY_ROOT)
    assert len(list(purgatory.iterdir())) == 4 + before

View file

@ -0,0 +1,25 @@
import os
import tempfile
from pathlib import Path
from django.core.management import call_command
def test_purge_purgatory(settings):
    """The command removes files older than the threshold (30 days by default)."""
    settings.UMAP_PURGATORY_ROOT = tempfile.mkdtemp()
    root = Path(settings.UMAP_PURGATORY_ROOT)

    def present():
        # Names of the files currently in the purgatory.
        return {f.name for f in root.iterdir()}

    # Create two files and push their timestamps back in time.
    for filename, age_in_days in (("old.json", 31), ("recent.json", 8)):
        path = root / filename
        path.write_text("{}")
        aged = path.stat().st_mtime - age_in_days * 86400
        os.utime(path, times=(aged, aged))
    # And one file that keeps its fresh timestamp.
    (root / "now.json").write_text("{}")
    assert present() == {"old.json", "recent.json", "now.json"}

    call_command("purge_purgatory")
    assert present() == {"recent.json", "now.json"}

    call_command("purge_purgatory", "--days=7")
    assert present() == {"now.json"}