Importer updates: watch directories, handle metadata updates

2020-05-07 09:55:29 +02:00 · 2020-05-07 09:55:29 +02:00 · 752c993e8e
commit 752c993e8e
parent 2b5a2b39ac
16 changed files with 1005 additions and 120 deletions
--- a/api/funkwhale_api/common/utils.py
+++ b/api/funkwhale_api/common/utils.py
@ -1,4 +1,5 @@
 import datetime
+import hashlib

 from django.core.files.base import ContentFile
 from django.http import request
@ -458,3 +459,19 @@ def monkey_patch_request_build_absolute_uri():

    request.HttpRequest.scheme = property(scheme)
    request.HttpRequest.get_host = get_host
+
+
+def get_file_hash(file, algo=None, chunk_size=None, full_read=False):
+    """
+    Return the digest of ``file`` as an ``"<algo>:<hexdigest>"`` string.
+
+    ``algo`` names a ``hashlib`` constructor and ``chunk_size`` is the number
+    of bytes read at once; both fall back to the ``HASHING_ALGORITHM`` and
+    ``HASHING_CHUNK_SIZE`` settings when falsy. The file is always rewound
+    to its start first. Unless ``full_read`` is true, only the first chunk
+    is hashed.
+    """
+    if not algo:
+        algo = settings.HASHING_ALGORITHM
+    if not chunk_size:
+        chunk_size = settings.HASHING_CHUNK_SIZE
+    digest = getattr(hashlib, algo)()
+    file.seek(0)
+    if not full_read:
+        # sometimes, it's useful to only hash the beginning of the file, e.g
+        # to avoid a lot of I/O when crawling large libraries
+        digest.update(file.read(chunk_size))
+    else:
+        while True:
+            block = file.read(chunk_size)
+            if block == b"":
+                break
+            digest.update(block)
+    return "{}:{}".format(algo, digest.hexdigest())
--- a/api/funkwhale_api/music/management/commands/fix_uploads.py
+++ b/api/funkwhale_api/music/management/commands/fix_uploads.py
@ -2,6 +2,7 @@ from django.core.management.base import BaseCommand
 from django.db import transaction
 from django.db.models import Q

+from funkwhale_api.common import utils as common_utils
 from funkwhale_api.music import models, utils


@ -17,9 +18,9 @@ class Command(BaseCommand):
            help="Do not execute anything",
        )
        parser.add_argument(
-            "--mimetypes",
+            "--mimetype",
            action="store_true",
-            dest="mimetypes",
+            dest="mimetype",
            default=True,
            help="Check and fix mimetypes",
        )
@ -37,16 +38,33 @@ class Command(BaseCommand):
            default=False,
            help="Check and fix file size, can be really slow because it needs to access files",
        )
+        parser.add_argument(
+            "--checksum",
+            action="store_true",
+            dest="checksum",
+            default=False,
+            help="Check and fix file size, can be really slow because it needs to access files",
+        )
+        parser.add_argument(
+            "--batch-size",
+            "-s",
+            dest="batch_size",
+            default=1000,
+            type=int,
+            help="Size of each updated batch",
+        )

     def handle(self, *args, **options):
+        # Entry point of the command: run every fix step whose option is
+        # enabled, forwarding all parsed options to the step.
         if options["dry_run"]:
             self.stdout.write("Dry-run on, will not commit anything")
-        if options["mimetypes"]:
+        if options["mimetype"]:
             self.fix_mimetypes(**options)
         if options["data"]:
             self.fix_file_data(**options)
         if options["size"]:
             self.fix_file_size(**options)
+        if options["checksum"]:
+            self.fix_file_checksum(**options)

    @transaction.atomic
    def fix_mimetypes(self, dry_run, **kwargs):
@ -54,11 +72,12 @@ class Command(BaseCommand):
        matching = models.Upload.objects.filter(
            Q(source__startswith="file://") | Q(source__startswith="upload://")
        ).exclude(mimetype__startswith="audio/")
+        total = matching.count()
        self.stdout.write(
-            "[mimetypes] {} entries found with bad or no mimetype".format(
-                matching.count()
-            )
+            "[mimetypes] {} entries found with bad or no mimetype".format(total)
        )
+        if not total:
+            return
        for extension, mimetype in utils.EXTENSION_TO_MIMETYPE.items():
            qs = matching.filter(source__endswith=".{}".format(extension))
            self.stdout.write(
@ -81,24 +100,36 @@ class Command(BaseCommand):
        )
        if dry_run:
            return
-        for i, upload in enumerate(matching.only("audio_file")):
-            self.stdout.write(
-                "[bitrate/length] {}/{} fixing file #{}".format(i + 1, total, upload.pk)
-            )

-            try:
-                audio_file = upload.get_audio_file()
-                if audio_file:
+        chunks = common_utils.chunk_queryset(
+            matching.only("id", "audio_file", "source"), kwargs["batch_size"]
+        )
+        handled = 0
+        for chunk in chunks:
+            updated = []
+            for upload in chunk:
+                handled += 1
+                self.stdout.write(
+                    "[bitrate/length] {}/{} fixing file #{}".format(
+                        handled, total, upload.pk
+                    )
+                )
+
+                try:
+                    audio_file = upload.get_audio_file()
                    data = utils.get_audio_file_data(audio_file)
                    upload.bitrate = data["bitrate"]
                    upload.duration = data["length"]
-                    upload.save(update_fields=["duration", "bitrate"])
+                except Exception as e:
+                    self.stderr.write(
+                        "[bitrate/length] error with file #{}: {}".format(
+                            upload.pk, str(e)
+                        )
+                    )
                else:
-                    self.stderr.write("[bitrate/length] no file found")
-            except Exception as e:
-                self.stderr.write(
-                    "[bitrate/length] error with file #{}: {}".format(upload.pk, str(e))
-                )
+                    updated.append(upload)
+
+            models.Upload.objects.bulk_update(updated, ["bitrate", "duration"])

    def fix_file_size(self, dry_run, **kwargs):
        self.stdout.write("Fixing missing size...")
@ -107,15 +138,64 @@ class Command(BaseCommand):
        self.stdout.write("[size] {} entries found with missing values".format(total))
        if dry_run:
            return
-        for i, upload in enumerate(matching.only("size")):
-            self.stdout.write(
-                "[size] {}/{} fixing file #{}".format(i + 1, total, upload.pk)
-            )

-            try:
-                upload.size = upload.get_file_size()
-                upload.save(update_fields=["size"])
-            except Exception as e:
-                self.stderr.write(
-                    "[size] error with file #{}: {}".format(upload.pk, str(e))
+        chunks = common_utils.chunk_queryset(
+            matching.only("id", "audio_file", "source"), kwargs["batch_size"]
+        )
+        handled = 0
+        for chunk in chunks:
+            updated = []
+            for upload in chunk:
+                handled += 1
+
+                self.stdout.write(
+                    "[size] {}/{} fixing file #{}".format(handled, total, upload.pk)
                )
+
+                try:
+                    upload.size = upload.get_file_size()
+                except Exception as e:
+                    self.stderr.write(
+                        "[size] error with file #{}: {}".format(upload.pk, str(e))
+                    )
+                else:
+                    updated.append(upload)
+
+            models.Upload.objects.bulk_update(updated, ["size"])
+
+    def fix_file_checksum(self, dry_run, **kwargs):
+        """
+        Compute and store the checksum of uploads that do not have one yet.
+
+        Only uploads with a stored audio file or a ``file://`` source are
+        considered. Nothing is written when ``dry_run`` is true. Uploads are
+        processed in chunks of ``kwargs["batch_size"]`` and each chunk is
+        persisted with a single ``bulk_update``. Uploads whose file cannot
+        be hashed are reported on stderr and left untouched.
+        """
+        self.stdout.write("Fixing missing checksums...")
+        matching = models.Upload.objects.filter(
+            Q(checksum=None)
+            & (Q(audio_file__isnull=False) | Q(source__startswith="file://"))
+        )
+        total = matching.count()
+        self.stdout.write(
+            "[checksum] {} entries found with missing values".format(total)
+        )
+        if dry_run:
+            return
+        chunks = common_utils.chunk_queryset(
+            matching.only("id", "audio_file", "source"), kwargs["batch_size"]
+        )
+        handled = 0
+        for chunk in chunks:
+            updated = []
+            for upload in chunk:
+                handled += 1
+                self.stdout.write(
+                    "[checksum] {}/{} fixing file #{}".format(handled, total, upload.pk)
+                )
+
+                audio_file = None
+                try:
+                    audio_file = upload.get_audio_file()
+                    upload.checksum = common_utils.get_file_hash(audio_file)
+                except Exception as e:
+                    self.stderr.write(
+                        "[checksum] error with file #{}: {}".format(upload.pk, str(e))
+                    )
+                else:
+                    updated.append(upload)
+                finally:
+                    # The handle is only needed for hashing. Close it right
+                    # away so that a long run does not pile up open files.
+                    if audio_file is not None:
+                        audio_file.close()
+
+            models.Upload.objects.bulk_update(updated, ["checksum"])
--- a/api/funkwhale_api/music/management/commands/import_files.py
+++ b/api/funkwhale_api/music/management/commands/import_files.py
@ -1,17 +1,29 @@
+import collections
+import datetime
 import itertools
 import os
-import urllib.parse
+import queue
+import threading
 import time
+import urllib.parse
+
+import watchdog.events
+import watchdog.observers

 from django.conf import settings
 from django.core.files import File
+from django.core.management import call_command
 from django.core.management.base import BaseCommand, CommandError
+from django.db.models import Q
 from django.utils import timezone

+from rest_framework import serializers
+
+from funkwhale_api.common import utils as common_utils
 from funkwhale_api.music import models, tasks, utils


-def crawl_dir(dir, extensions, recursive=True):
+def crawl_dir(dir, extensions, recursive=True, ignored=[]):
    if os.path.isfile(dir):
        yield dir
        return
@ -20,9 +32,12 @@ def crawl_dir(dir, extensions, recursive=True):
            if entry.is_file():
                for e in extensions:
                    if entry.name.lower().endswith(".{}".format(e.lower())):
-                        yield entry.path
+                        if entry.path not in ignored:
+                            yield entry.path
            elif recursive and entry.is_dir():
-                yield from crawl_dir(entry, extensions, recursive=recursive)
+                yield from crawl_dir(
+                    entry, extensions, recursive=recursive, ignored=ignored
+                )


 def batch(iterable, n=1):
@ -116,6 +131,17 @@ class Command(BaseCommand):
                "of overhead on your server and on servers you are federating with."
            ),
        )
+        parser.add_argument(
+            "--watch",
+            action="store_true",
+            dest="watch",
+            default=False,
+            help=(
+                "Start the command in watch mode. Instead of running a full import, "
+                "and exit, watch the given path and import new files, remove deleted "
+                "files, and update metadata corresponding to updated files."
+            ),
+        )
        parser.add_argument("-e", "--extension", nargs="+")

        parser.add_argument(
@ -128,6 +154,15 @@ class Command(BaseCommand):
                "This causes some overhead, so it's disabled by default."
            ),
        )
+        parser.add_argument(
+            "--prune",
+            action="store_true",
+            dest="prune",
+            default=False,
+            help=(
+                "Once the import is completed, prune tracks, ablums and artists that aren't linked to any upload."
+            ),
+        )

        parser.add_argument(
            "--reference",
@ -157,6 +192,8 @@ class Command(BaseCommand):
        )

    def handle(self, *args, **options):
+        # handle relative directories
+        options["path"] = [os.path.abspath(path) for path in options["path"]]
        self.is_confirmed = False
        try:
            library = models.Library.objects.select_related("actor__user").get(
@ -182,22 +219,12 @@ class Command(BaseCommand):
                    )
                if p and not import_path.startswith(p):
                    raise CommandError(
-                        "Importing in-place only works if importing"
+                        "Importing in-place only works if importing "
                        "from {} (MUSIC_DIRECTORY_PATH), as this directory"
                        "needs to be accessible by the webserver."
                        "Culprit: {}".format(p, import_path)
                    )

-        extensions = options.get("extension") or utils.SUPPORTED_EXTENSIONS
-        crawler = itertools.chain(
-            *[
-                crawl_dir(p, extensions=extensions, recursive=options["recursive"])
-                for p in options["path"]
-            ]
-        )
-        errors = []
-        total = 0
-        start_time = time.time()
        reference = options["reference"] or "cli-{}".format(timezone.now().isoformat())

        import_url = "{}://{}/library/{}/upload?{}"
@ -212,8 +239,62 @@ class Command(BaseCommand):
                reference, import_url
            )
        )
+        extensions = options.get("extension") or utils.SUPPORTED_EXTENSIONS
+        if options["watch"]:
+            if len(options["path"]) > 1:
+                raise CommandError("Watch only work with a single directory")
+
+            return self.setup_watcher(
+                extensions=extensions,
+                path=options["path"][0],
+                reference=reference,
+                library=library,
+                in_place=options["in_place"],
+                prune=options["prune"],
+                recursive=options["recursive"],
+                replace=options["replace"],
+                dispatch_outbox=options["outbox"],
+                broadcast=options["broadcast"],
+            )
+
+        update = True
+        checked_paths = set()
+        if options["in_place"] and update:
+            self.stdout.write("Checking existing files for updates…")
+            message = (
+                "Are you sure you want to do this?\n\n"
+                "Type 'yes' to continue, or 'no' to skip checking for updates in "
+                "already imported files: "
+            )
+            if options["interactive"] and input("".join(message)) != "yes":
+                pass
+            else:
+                checked_paths = check_updates(
+                    stdout=self.stdout,
+                    paths=options["path"],
+                    extensions=extensions,
+                    library=library,
+                    batch_size=options["batch_size"],
+                )
+                self.stdout.write("Existing files checked, moving on to next step!")
+
+        crawler = itertools.chain(
+            *[
+                crawl_dir(
+                    p,
+                    extensions=extensions,
+                    recursive=options["recursive"],
+                    ignored=checked_paths,
+                )
+                for p in options["path"]
+            ]
+        )
+        errors = []
+        total = 0
+        start_time = time.time()
        batch_start = None
        batch_duration = None
+        self.stdout.write("Starting import of new files…")
        for i, entries in enumerate(batch(crawler, options["batch_size"])):
            total += len(entries)
            batch_start = time.time()
@ -225,7 +306,7 @@ class Command(BaseCommand):
            if entries:
                self.stdout.write(
                    "Handling batch {} ({} items){}".format(
-                        i + 1, options["batch_size"], time_stats,
+                        i + 1, len(entries), time_stats,
                    )
                )
                batch_errors = self.handle_batch(
@ -240,9 +321,9 @@ class Command(BaseCommand):

            batch_duration = time.time() - batch_start

-        message = "Successfully imported {} tracks in {}s"
+        message = "Successfully imported {} new tracks in {}s"
        if options["async_"]:
-            message = "Successfully launched import for {} tracks in {}s"
+            message = "Successfully launched import for {} new tracks in {}s"

        self.stdout.write(
            message.format(total - len(errors), int(time.time() - start_time))
@ -259,6 +340,12 @@ class Command(BaseCommand):
            )
        )

+        if options["prune"]:
+            self.stdout.write(
+                "Pruning dangling tracks, albums and artists from library…"
+            )
+            prune()
+
    def handle_batch(self, library, paths, batch, reference, options):
        matching = []
        for m in paths:
@ -362,15 +449,15 @@ class Command(BaseCommand):
                    message.format(batch=batch, path=path, i=i + 1, total=len(paths))
                )
            try:
-                self.create_upload(
-                    path,
-                    reference,
-                    library,
-                    async_,
-                    options["replace"],
-                    options["in_place"],
-                    options["outbox"],
-                    options["broadcast"],
+                create_upload(
+                    path=path,
+                    reference=reference,
+                    library=library,
+                    async_=async_,
+                    replace=options["replace"],
+                    in_place=options["in_place"],
+                    dispatch_outbox=options["outbox"],
+                    broadcast=options["broadcast"],
                )
            except Exception as e:
                if options["exit_on_failure"]:
@ -382,34 +469,311 @@ class Command(BaseCommand):
                errors.append((path, "{} {}".format(e.__class__.__name__, e)))
        return errors

-    def create_upload(
-        self,
-        path,
-        reference,
-        library,
-        async_,
-        replace,
-        in_place,
-        dispatch_outbox,
-        broadcast,
-    ):
-        import_handler = tasks.process_upload.delay if async_ else tasks.process_upload
-        upload = models.Upload(library=library, import_reference=reference)
-        upload.source = "file://" + path
-        upload.import_metadata = {
-            "funkwhale": {
-                "config": {
-                    "replace": replace,
-                    "dispatch_outbox": dispatch_outbox,
-                    "broadcast": broadcast,
-                }
+    def setup_watcher(self, path, extensions, recursive, **kwargs):
+        """
+        Watch ``path`` and process filesystem changes until interrupted.
+
+        A watchdog observer pushes events for files matching ``extensions``
+        on a queue, which is consumed by a background worker thread. This
+        call blocks until Ctrl-C is pressed. Every 10 seconds, when
+        ``kwargs["prune"]`` is set and a deletion was recorded in
+        ``GLOBAL["need_pruning"]``, the library is pruned.
+
+        All ``kwargs`` are forwarded to the event handlers through
+        ``process_load_queue``.
+        """
+        watchdog_queue = queue.Queue()
+        # Set up a worker thread to process database load. It is a daemon
+        # thread so that it never prevents the command from exiting.
+        worker = threading.Thread(
+            target=process_load_queue(self.stdout, **kwargs),
+            args=(watchdog_queue,),
+            daemon=True,
+        )
+        worker.start()
+
+        # setup watchdog to monitor directory for trigger files
+        patterns = ["*.{}".format(e) for e in extensions]
+        event_handler = Watcher(
+            stdout=self.stdout, queue=watchdog_queue, patterns=patterns,
+        )
+        observer = watchdog.observers.Observer()
+        observer.schedule(event_handler, path, recursive=recursive)
+        observer.start()
+
+        try:
+            while True:
+                self.stdout.write(
+                    "Watching for changes at {}…".format(path), ending="\r"
+                )
+                time.sleep(10)
+                if kwargs["prune"] and GLOBAL["need_pruning"]:
+                    self.stdout.write("Some files were deleted, pruning library…")
+                    prune()
+                    GLOBAL["need_pruning"] = False
+        except KeyboardInterrupt:
+            self.stdout.write("Exiting…")
+        finally:
+            # Always shut the observer down, including when pruning fails,
+            # so that no watcher thread is left running behind us.
+            observer.stop()
+            observer.join()
+
+
+# Module-level state shared between the event handlers and the watch loop:
+# "need_pruning" is set to True by handle_deleted when uploads were removed
+# and reset to False by Command.setup_watcher once the library was pruned.
+GLOBAL = {"need_pruning": False}
+
+
+def prune():
+    """Run the ``prune_library`` command on tracks, albums and artists, for real."""
+    options = {
+        "dry_run": False,
+        "prune_artists": True,
+        "prune_albums": True,
+        "prune_tracks": True,
+    }
+    call_command("prune_library", **options)
+
+
+def create_upload(
+    path, reference, library, async_, replace, in_place, dispatch_outbox, broadcast,
+):
+    # Create an Upload in `library` for the file at `path`, then process it.
+    #
+    # The upload source is "file://<path>" and `reference` is stored as its
+    # import reference. `replace`, `dispatch_outbox` and `broadcast` are
+    # stored in the upload's import metadata. Unless `in_place` is true, the
+    # file content is also saved into the upload's `audio_file` field.
+    # Processing goes through `tasks.process_upload.delay` when `async_` is
+    # true, and through a direct call to `tasks.process_upload` otherwise.
+    import_handler = tasks.process_upload.delay if async_ else tasks.process_upload
+    upload = models.Upload(library=library, import_reference=reference)
+    upload.source = "file://" + path
+    upload.import_metadata = {
+        "funkwhale": {
+            "config": {
+                "replace": replace,
+                "dispatch_outbox": dispatch_outbox,
+                "broadcast": broadcast,
             }
         }
-        if not in_place:
-            name = os.path.basename(path)
-            with open(path, "rb") as f:
-                upload.audio_file.save(name, File(f), save=False)
+    }
+    if not in_place:
+        name = os.path.basename(path)
+        with open(path, "rb") as f:
+            upload.audio_file.save(name, File(f), save=False)

-        upload.save()
+    upload.save()

-        import_handler(upload_id=upload.pk)
+    import_handler(upload_id=upload.pk)
+
+
+def process_load_queue(stdout, **kwargs):
+    """
+    Build the worker function that consumes filesystem events from a queue.
+
+    The returned callable takes the queue as its only argument and loops
+    forever. Events are grouped by path (a later event for a path replaces
+    the pending one) and only dispatched to ``handle_event`` once they are
+    more than ``flush_delay`` seconds old. ``stdout`` and ``kwargs`` are
+    forwarded to ``handle_event``.
+    """
+
+    def inner(q):
+        # we batch events, to avoid calling same methods multiple times if a file is modified
+        # a lot in a really short time
+        flush_delay = 2
+        batched_events = collections.OrderedDict()
+        while True:
+            while True:
+                if not q.empty():
+                    event = q.get()
+                    batched_events[event["path"]] = event
+                else:
+                    break
+            for path, event in batched_events.copy().items():
+                if time.time() - event["time"] <= flush_delay:
+                    continue
+                now = datetime.datetime.utcnow()
+                stdout.write(
+                    "{} -- Processing {}:{}...\n".format(
+                        now.strftime("%Y/%m/%d %H:%M:%S"), event["type"], event["path"]
+                    )
+                )
+                del batched_events[path]
+                try:
+                    handle_event(event, stdout=stdout, **kwargs)
+                except Exception as e:
+                    # A failing handler (e.g. the file vanished before it
+                    # could be opened) must not kill this thread, otherwise
+                    # every later event would be silently dropped.
+                    stdout.write(
+                        "  Error while processing {}:{}: {} {}\n".format(
+                            event["type"], event["path"], e.__class__.__name__, e
+                        )
+                    )
+            time.sleep(1)
+
+    return inner
+
+
+class Watcher(watchdog.events.PatternMatchingEventHandler):
+    """Event handler that pushes every matching filesystem event on a queue."""
+
+    def __init__(self, stdout, queue, patterns):
+        self.stdout = stdout
+        self.queue = queue
+        super().__init__(patterns=patterns)
+
+    def enqueue(self, event):
+        """Put a plain-dict, timestamped description of ``event`` on the queue."""
+        source = event.src_path
+        self.queue.put(
+            {
+                "is_directory": event.is_directory,
+                "type": event.event_type,
+                "path": source,
+                "src_path": source,
+                "dest_path": getattr(event, "dest_path", None),
+                "time": time.time(),
+            }
+        )
+
+    # all supported event types are simply queued for the worker thread
+
+    def on_moved(self, event):
+        self.enqueue(event)
+
+    def on_created(self, event):
+        self.enqueue(event)
+
+    def on_deleted(self, event):
+        self.enqueue(event)
+
+    def on_modified(self, event):
+        self.enqueue(event)
+
+
+def handle_event(event, stdout, **kwargs):
+    """Dispatch ``event`` to the handler matching its ``"type"`` key."""
+    handler = {
+        "modified": handle_modified,
+        "created": handle_created,
+        "moved": handle_moved,
+        "deleted": handle_deleted,
+    }[event["type"]]
+    handler(event=event, stdout=stdout, **kwargs)
+
+
+def handle_modified(event, stdout, library, in_place, **kwargs):
+    """
+    React to a created or modified file.
+
+    Nothing is done when a finished upload of ``library`` already has the
+    file's checksum. Otherwise, in in-place mode, a finished upload with the
+    same source gets its track metadata refreshed (unless the track is
+    attributed to another actor). In every other case the file is imported
+    as a new upload.
+    """
+    finished = library.uploads.filter(import_status="finished")
+    with open(event["path"], "rb") as fh:
+        checksum = common_utils.get_file_hash(fh)
+
+    if finished.filter(checksum=checksum).first():
+        # found an existing file with same checksum, nothing to do
+        stdout.write("  File already imported and metadata is up-to-date")
+        return
+
+    outdated = None
+    if in_place:
+        outdated = (
+            finished.in_place()
+            .filter(source="file://{}".format(event["path"]))
+            .select_related(
+                "track__attributed_to", "track__artist", "track__album__artist",
+            )
+            .first()
+        )
+
+    if outdated:
+        owner = outdated.track.attributed_to
+        if owner and owner != library.actor:
+            stdout.write(
+                "  Cannot update track metadata, track belongs to someone else"
+            )
+            return
+
+        stdout.write(
+            "  Updating existing file #{} with new metadata…".format(outdated.pk)
+        )
+        audio_metadata = outdated.get_metadata()
+        try:
+            tasks.update_track_metadata(audio_metadata, outdated.track)
+        except serializers.ValidationError as e:
+            stdout.write("  Invalid metadata: {}".format(e))
+        else:
+            # only remember the new checksum once the metadata was applied
+            outdated.checksum = checksum
+            outdated.save(update_fields=["checksum"])
+        return
+
+    stdout.write("  Launching import for new file")
+    create_upload(
+        path=event["path"],
+        reference=kwargs["reference"],
+        library=library,
+        async_=False,
+        replace=kwargs["replace"],
+        in_place=in_place,
+        dispatch_outbox=kwargs["dispatch_outbox"],
+        broadcast=kwargs["broadcast"],
+    )
+
+
+def handle_created(event, stdout, **kwargs):
+    """
+    Handle a created file exactly like a modified one.
+
+    Copying a file into the watched directory fires a created event on the
+    initial touch, followed by many modified events while it is written, so
+    both event types share the same logic.
+    """
+    return handle_modified(event=event, stdout=stdout, **kwargs)
+
+
+def handle_moved(event, stdout, library, in_place, **kwargs):
+    """
+    Point the matching in-place upload at the new location of a moved file.
+
+    Does nothing outside of in-place mode, or when no finished upload of
+    ``library`` has the old path as its source.
+    """
+    if not in_place:
+        return
+
+    previous = "file://{}".format(event["src_path"])
+    current = "file://{}".format(event["dest_path"])
+    moved = (
+        library.uploads.filter(import_status="finished")
+        .in_place()
+        .filter(source=previous)
+        .first()
+    )
+    if not moved:
+        return
+
+    stdout.write("  Updating path of existing file #{}".format(moved.pk))
+    moved.source = current
+    moved.save(update_fields=["source"])
+
+
+def handle_deleted(event, stdout, library, in_place, **kwargs):
+    """
+    Delete the finished in-place uploads of ``library`` backed by a removed file.
+
+    Does nothing outside of in-place mode. When uploads were deleted,
+    ``GLOBAL["need_pruning"]`` is set to True.
+    """
+    if in_place:
+        removed = (
+            library.uploads.filter(import_status="finished")
+            .in_place()
+            .filter(source="file://{}".format(event["path"]))
+        )
+        if removed.count():
+            stdout.write("  Removing file from DB")
+            removed.delete()
+            GLOBAL["need_pruning"] = True
+
+
+def check_updates(stdout, library, extensions, paths, batch_size):
+    """
+    Check the already imported in-place uploads located under ``paths``.
+
+    Every finished in-place upload of ``library`` that has a checksum, a
+    source starting with one of ``paths`` and ending with one of
+    ``extensions`` is passed to ``check_upload``, in batches of
+    ``batch_size``.
+
+    Returns the set of filesystem paths (source without the ``file://``
+    prefix) that were checked.
+    """
+    candidates = (
+        library.uploads.in_place()
+        .filter(import_status="finished")
+        .exclude(checksum=None)
+        .select_related("library", "track")
+    )
+    filters = [
+        Q(source__startswith="file://{}".format(path))
+        & Q(source__endswith=".{}".format(ext))
+        for path in paths
+        for ext in extensions
+    ]
+    # OR all the (path, extension) conditions together
+    combined = filters[0]
+    for extra in filters[1:]:
+        combined = extra | combined
+    candidates = candidates.filter(combined)
+    stdout.write("Found {} files to check in database!".format(candidates.count()))
+
+    checked_paths = set()
+    batches = batch(candidates.order_by("source").iterator(), batch_size)
+    for number, rows in enumerate(batches, start=1):
+        stdout.write("Handling batch {} ({} items)".format(number, len(rows)))
+        for upload in rows:
+            check_upload(stdout, upload)
+            checked_paths.add(upload.source.replace("file://", "", 1))
+
+    return checked_paths
+
+
+def check_upload(stdout, upload):
+    """
+    Check a single upload against its file on disk.
+
+    The upload is deleted when opening its file raises
+    ``FileNotFoundError``. When the file checksum differs from the stored
+    one, the track metadata is refreshed from the file and the new checksum
+    is saved, provided the track is attributed to the actor owning the
+    library and the file metadata is valid.
+
+    Returns the result of ``upload.delete()`` when the file is missing,
+    ``None`` otherwise.
+    """
+    try:
+        audio_file = upload.get_audio_file()
+    except FileNotFoundError:
+        stdout.write(
+            "  Removing file #{} missing from disk at {}".format(
+                upload.pk, upload.source
+            )
+        )
+        return upload.delete()
+
+    try:
+        checksum = common_utils.get_file_hash(audio_file)
+    finally:
+        # The handle is only needed to compute the hash: close it right away
+        # so that checking a large library does not accumulate open files.
+        audio_file.close()
+
+    if upload.checksum == checksum:
+        return
+
+    stdout.write(
+        "  File #{} at {} was modified, updating metadata…".format(
+            upload.pk, upload.source
+        )
+    )
+    if upload.library.actor_id != upload.track.attributed_to_id:
+        stdout.write("  Cannot update track metadata, track belongs to someone else")
+        return
+
+    track = models.Track.objects.select_related("artist", "album__artist").get(
+        pk=upload.track_id
+    )
+    try:
+        tasks.update_track_metadata(upload.get_metadata(), track)
+    except serializers.ValidationError as e:
+        stdout.write("  Invalid metadata: {}".format(e))
+        return
+
+    # only remember the new checksum once the metadata was applied
+    upload.checksum = checksum
+    upload.save(update_fields=["checksum"])
--- a/api/funkwhale_api/music/migrations/0052_auto_20200505_0810.py
+++ b/api/funkwhale_api/music/migrations/0052_auto_20200505_0810.py
@ -0,0 +1,23 @@
+# Generated by Django 3.0.4 on 2020-05-05 08:10
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+    # Adds the nullable, indexed `checksum` column to uploads and updates
+    # the mimetype choices of upload versions.
+
+    dependencies = [
+        ('music', '0051_auto_20200319_1249'),
+    ]
+
+    operations = [
+        # new optional checksum, indexed
+        migrations.AddField(
+            model_name='upload',
+            name='checksum',
+            field=models.CharField(blank=True, db_index=True, max_length=100, null=True),
+        ),
+        # redefines the list of choices of the mimetype field
+        migrations.AlterField(
+            model_name='uploadversion',
+            name='mimetype',
+            field=models.CharField(choices=[('audio/mp3', 'mp3'), ('audio/mpeg3', 'mp3'), ('audio/x-mp3', 'mp3'), ('audio/mpeg', 'mp3'), ('video/ogg', 'ogg'), ('audio/ogg', 'ogg'), ('audio/opus', 'opus'), ('audio/x-m4a', 'aac'), ('audio/x-m4a', 'm4a'), ('audio/x-flac', 'flac'), ('audio/flac', 'flac')], max_length=50),
+        ),
+    ]
--- a/api/funkwhale_api/music/models.py
+++ b/api/funkwhale_api/music/models.py
@ -655,6 +655,14 @@ class Track(APIModelMixin):


 class UploadQuerySet(common_models.NullsLastQuerySet):
+    def in_place(self, include=True):
+        """
+        Filter on uploads whose source starts with ``file://`` and whose
+        ``audio_file`` is empty or null.
+
+        With ``include=False`` the condition is negated, so only the other
+        uploads are returned.
+        """
+        no_audio_file = models.Q(audio_file="") | models.Q(audio_file=None)
+        query = models.Q(source__startswith="file://") & no_audio_file
+        return self.filter(query if include else ~query)
+
    def playable_by(self, actor, include=True):
        libraries = Library.objects.viewable_by(actor)

@ -754,6 +762,9 @@ class Upload(models.Model):
    )
    downloads_count = models.PositiveIntegerField(default=0)

+    # stores checksums such as `sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855`
+    checksum = models.CharField(max_length=100, db_index=True, null=True, blank=True)
+
    objects = UploadQuerySet.as_manager()

    @property
@ -833,7 +844,7 @@ class Upload(models.Model):
     def get_audio_file(self):
+        # Return an opened file for this upload: the stored `audio_file` when
+        # there is one, else the local file referenced by a "file://" source,
+        # opened in binary mode. Falls through (implicitly returning None)
+        # when neither is available.
         if self.audio_file:
             return self.audio_file.open()
-        if self.source.startswith("file://"):
+        if self.source and self.source.startswith("file://"):
             return open(self.source.replace("file://", "", 1), "rb")

    def get_audio_data(self):
@ -866,6 +877,15 @@ class Upload(models.Model):
                self.mimetype = mimetypes.guess_type(self.source)[0]
        if not self.size and self.audio_file:
            self.size = self.audio_file.size
+        if not self.checksum:
+            try:
+                audio_file = self.get_audio_file()
+            except FileNotFoundError:
+                pass
+            else:
+                if audio_file:
+                    self.checksum = common_utils.get_file_hash(audio_file)
+
        if not self.pk and not self.fid and self.library.actor.get_user():
            self.fid = self.get_federation_id()
        return super().save(**kwargs)
--- a/api/funkwhale_api/music/tasks.py
+++ b/api/funkwhale_api/music/tasks.py
@ -851,3 +851,71 @@ def update_library_entity(obj, data):
    obj.save(update_fields=list(data.keys()))

    return obj
+
+
+# Fields that update_track_metadata may refresh, grouped by the entity they
+# belong to. Each field maps to an options dict: an empty dict means the new
+# value is read from the metadata under the same key; an optional "getter"
+# callable receives (data, field) and returns the new value; an optional
+# "field" entry names an alternative key to read in the metadata.
+UPDATE_CONFIG = {
+    "track": {
+        "position": {},
+        "title": {},
+        "mbid": {},
+        "disc_number": {},
+        "copyright": {},
+        "license": {
+            # the license is matched from the license and copyright values
+            "getter": lambda data, field: licenses.match(
+                data.get("license"), data.get("copyright")
+            )
+        },
+    },
+    "album": {"title": {}, "mbid": {}, "release_date": {}},
+    "artist": {"name": {}, "mbid": {}},
+    "album_artist": {"name": {}, "mbid": {}},
+}
+
+
+@transaction.atomic
+def update_track_metadata(audio_metadata, track):
+    """
+    Update ``track`` and its album, artist and album artist from file metadata.
+
+    This supports refreshing the library when an imported file is updated
+    by an outside tool (e.g beets). ``audio_metadata`` is validated with
+    ``TrackMetadataSerializer``; a ``ValidationError`` is raised when it is
+    invalid. Only the fields listed in ``UPDATE_CONFIG`` whose value changed
+    are saved. Entities that are missing on the track, or for which the
+    metadata holds no data, are skipped. The album cover is attached when
+    the metadata provides one.
+    """
+    serializer = metadata.TrackMetadataSerializer(data=audio_metadata)
+    serializer.is_valid(raise_exception=True)
+    new_data = serializer.validated_data
+
+    # (UPDATE_CONFIG key, object to update, how to find its data in new_data)
+    to_update = [
+        ("track", track, lambda data: data),
+        ("album", track.album, lambda data: data["album"]),
+        ("artist", track.artist, lambda data: data["artists"][0]),
+        (
+            "album_artist",
+            track.album.artist if track.album else None,
+            lambda data: data["album"]["artists"][0],
+        ),
+    ]
+    for entity, obj, data_getter in to_update:
+        if not obj:
+            continue
+        try:
+            obj_data = data_getter(new_data)
+        except (IndexError, KeyError):
+            # no data for this entity in the file (empty artist list, or no
+            # album/artists key at all): leave the object untouched
+            continue
+
+        obj_updated_fields = []
+        for field, config in UPDATE_CONFIG[entity].items():
+            getter = config.get("getter")
+            try:
+                if getter:
+                    new_value = getter(obj_data, field)
+                else:
+                    new_value = obj_data[config.get("field", field)]
+            except KeyError:
+                continue
+            if new_value == getattr(obj, field):
+                continue
+            obj_updated_fields.append(field)
+            setattr(obj, field, new_value)
+
+        if obj_updated_fields:
+            obj.save(update_fields=obj_updated_fields)
+
+    if track.album and "album" in new_data and new_data["album"].get("cover_data"):
+        common_utils.attach_file(
+            track.album, "attachment_cover", new_data["album"].get("cover_data")
+        )