From 7bc8109732a426316c806b2b240213e93567c62f Mon Sep 17 00:00:00 2001 From: Eliot Berriot Date: Fri, 12 Jul 2019 15:06:39 +0200 Subject: [PATCH] See #432: tags acquisition (from audio files) --- api/funkwhale_api/music/metadata.py | 65 ++++++++++++++++++++++++- api/funkwhale_api/music/tasks.py | 7 ++- api/tests/music/sample.flac | Bin 91522 -> 91522 bytes api/tests/music/test.mp3 | Bin 297745 -> 297745 bytes api/tests/music/test.ogg | Bin 14858 -> 15918 bytes api/tests/music/test.opus | Bin 14583 -> 15643 bytes api/tests/music/test_metadata.py | 46 +++++++++++++++++ api/tests/music/test_models.py | 2 +- api/tests/music/test_tasks.py | 3 ++ api/tests/music/with_other_picture.mp3 | Bin 116211 -> 116339 bytes 10 files changed, 119 insertions(+), 4 deletions(-) diff --git a/api/funkwhale_api/music/metadata.py b/api/funkwhale_api/music/metadata.py index f0ea67b1a..77f85aef5 100644 --- a/api/funkwhale_api/music/metadata.py +++ b/api/funkwhale_api/music/metadata.py @@ -2,6 +2,7 @@ import base64 import datetime import logging import pendulum +import re import mutagen._util import mutagen.oggtheora @@ -144,6 +145,7 @@ CONF = { "mbid": {"field": "musicbrainz_trackid"}, "license": {}, "copyright": {}, + "genre": {}, }, }, "OggVorbis": { @@ -162,6 +164,7 @@ CONF = { "mbid": {"field": "musicbrainz_trackid"}, "license": {}, "copyright": {}, + "genre": {}, "pictures": { "field": "metadata_block_picture", "to_application": clean_ogg_pictures, @@ -184,6 +187,7 @@ CONF = { "mbid": {"field": "MusicBrainz Track Id"}, "license": {}, "copyright": {}, + "genre": {}, }, }, "MP3": { @@ -199,6 +203,7 @@ CONF = { "date": {"field": "TDRC"}, "musicbrainz_albumid": {"field": "MusicBrainz Album Id"}, "musicbrainz_artistid": {"field": "MusicBrainz Artist Id"}, + "genre": {"field": "TCON"}, "musicbrainz_albumartistid": {"field": "MusicBrainz Album Artist Id"}, "mbid": {"field": "UFID", "getter": get_mp3_recording_id}, "pictures": {}, @@ -220,6 +225,7 @@ CONF = { "musicbrainz_albumid": {}, "musicbrainz_artistid": {}, "musicbrainz_albumartistid": {}, + "genre": {}, "mbid": {"field": "musicbrainz_trackid"}, "test": {}, "pictures": {}, @@ -485,6 +491,61 @@ class PermissiveDateField(serializers.CharField): return None +TAG_REGEX = re.compile(r"^((\w+)([\d_]*))$") + + +def extract_tags_from_genre(string): + tags = [] + delimiter = "@@@@@" + for d in [" - ", ",", ";", "/"]: + # Replace common tags separators by a custom delimiter + string = string.replace(d, delimiter) + + # loop on the parts (splitting on our custom delimiter) + for tag in string.split(delimiter): + tag = tag.strip() + for d in ["-"]: + # preparation for replacement so that Pop-Rock becomes Pop Rock, then PopRock + # (step 1, step 2 happens below) + tag = tag.replace(d, " ") + if not tag: + continue + final_tag = "" + if not TAG_REGEX.match(tag.replace(" ", "")): + # the string contains some non words chars ($, €, etc.), right now + # we simply skip such tags + continue + # concatenate the parts and uppercase them so that 'pop rock' becomes 'PopRock' + if len(tag.split(" ")) == 1: + # we append the tag "as is", because it doesn't contain any space + tags.append(tag) + continue + for part in tag.split(" "): + # the tag contains space, there's work to do to have consistent case + # 'pop rock' -> 'PopRock' + # (step 2) + if not part: + continue + final_tag += part[0].upper() + part[1:] + if final_tag: + tags.append(final_tag) + return tags + + +class TagsField(serializers.CharField): + def get_value(self, data): + return data + + def to_internal_value(self, data): + try: + value = data.get("genre") or "" + except TagNotFound: + return [] + value = super().to_internal_value(str(value)) + + return extract_tags_from_genre(value) + + class MBIDField(serializers.UUIDField): def __init__(self, *args, **kwargs): kwargs.setdefault("allow_null", True) @@ -533,6 +594,7 @@ class TrackMetadataSerializer(serializers.Serializer): copyright = serializers.CharField(allow_blank=True, allow_null=True, required=False) license = serializers.CharField(allow_blank=True, allow_null=True, required=False) mbid = MBIDField() + tags = TagsField(allow_blank=True, allow_null=True, required=False) album = AlbumField() artists = ArtistField() @@ -544,6 +606,7 @@ class TrackMetadataSerializer(serializers.Serializer): "position", "disc_number", "mbid", + "tags", ] def validate(self, validated_data): @@ -553,7 +616,7 @@ class TrackMetadataSerializer(serializers.Serializer): v = validated_data[field] except KeyError: continue - if v in ["", None]: + if v in ["", None, []]: validated_data.pop(field) return validated_data diff --git a/api/funkwhale_api/music/tasks.py b/api/funkwhale_api/music/tasks.py index ff3cde440..7372f82c5 100644 --- a/api/funkwhale_api/music/tasks.py +++ b/api/funkwhale_api/music/tasks.py @@ -14,6 +14,7 @@ from requests.exceptions import RequestException from funkwhale_api.common import channels, preferences from funkwhale_api.federation import routes from funkwhale_api.federation import library as lb +from funkwhale_api.tags import models as tags_models from funkwhale_api.taskapp import celery from . import licenses @@ -541,10 +542,12 @@ def _get_track(data, attributed_to=None): if data.get("fdate"): defaults["creation_date"] = data.get("fdate") - track = get_best_candidate_or_create( + track, created = get_best_candidate_or_create( models.Track, query, defaults=defaults, sort_fields=["mbid", "fid"] - )[0] + ) + if created: + tags_models.add_tags(track, *data.get("tags", [])) return track diff --git a/api/tests/music/sample.flac b/api/tests/music/sample.flac index fe3ec6e4a270da0ff0a3f1ca677da4990ddb0ea5..a8aafa39239a199663c6673c678064826ecf965b 100644 GIT binary patch delta 493 zcmZoV&DwODb%L(&MqUO62A{;TG*fduQ!_n714HA94qg*yOHO8Db)KBdB&{XLz`)=i zKJ5eWME(c5)5+nadixK4T%hJwN1=PD$SjIoykU+pMinFG1$}3*2%!Y z(AmvDz&CJmDzoq8=}Zoj1sEm8gcukYd_5yvgKQmh67v*%^GkD5Qi~=>u;fhK;%&#x zz`zjf9OM}gV(XJwk|)N%zyP%-IK(j|G}tyjEiE%SGcgCr0nYxRej!1Twh`fz{TSsY z`!Gui@qrxS;}q&^8z}eq7z|+Su#M9r;)B3l&YlNT|0`-7B+c!v16+FDv#>R4J@f|UD226;Mr28Y;&6s6{sCeO%GB7X%x%#*|2D>`@hx&yCMcPJ$^Drb~52-U|?`@3~{wJGBB{vH89mRFa>D`>GTWrb#e`|H3Yi`qzP;$NTVC8 zvoJ`bW3Z>6t&@R)p|hKRfN$XBtIWz;;tUK7{z0DZo_>x#Xa;jLFfar=2YCjB*!m=v zoxzeQbS8i}MRo^Y|GU7+f4fTy2dE42^UREOZS`xfvK3f}Mjr143+l5=-&~85kIX zTzy;}gI%5dL;XU6B5fnWc^DWNLi|GFn-#f}nKm16uVUOR&idH|03cROe*gdg delta 195 zcmZ2i(^WDdfU9ipmQ2zA|NsBrI5AFxQDI`c&tz`KBmCS93=F}}L7o92wmyj^d6UmG zIx6!sFfh0{hPc`q85kPr8d&HWn({I*F!(q+x%$}pmKNt1q)wJ&7Mm={04rb%Y#Vj47 of(#4{&i(V{*o+;3=9ktt34+-u!v5+$)Y&%$dSp@8677JGTo`? zWnf_NaddL^vGpx2&M!#K<7Z%CaB&QAwKXy@G}1M&&^0vWW?*0lb`J6k2(k4^EXfmO zU|K9!c6Ig-^$Q7#w2cVoVPIeg@egtIaq$dx4z@J}@j*<#P+uq4AX`I_Jz)8e zAV+8KU|SPm1_lOS*U2i(zLS448&57|?wG8@5;A!zi-{4)3GS|bL9VvWIf=!^naPPc lAmhL~!DiT+fSd#}9>O)8%xIx8S-?Va5M!~;zgYen0|1)Ka#a8T delta 254 zcmbPT^}TR{tJzfpQ*YM)|NsAA?_W?_9Fmw`%*(*Q;FDOEW@@2lXryOoV4%Rjz`)JG zz~JH;>>T1B;^<>*II+NU;`<}~AkkpwAkTmhTc5;|yvdsx9Vb6z5>^8n;ppV*W9wU5 zoL`WdC&<9S5ajCP>KN?m>>uhE5)^405kC0~;~hgD1_lPOp&-+PZ4E(M!TgXQM`!O~ zTN981kepwruaj$#t>I)(=HSWanL8%WVK$!lUVL&Mi>)ZgcxV5B$RJO5j}Y77{M^+1 cywu67Sxf{#y1^!a&9pU{cusM%9OqwS0F=2(*Z=?k diff --git a/api/tests/music/test_metadata.py b/api/tests/music/test_metadata.py index 539fa49a2..121fef4b2 100644 --- a/api/tests/music/test_metadata.py +++ b/api/tests/music/test_metadata.py @@ -57,6 +57,7 @@ def test_can_get_metadata_all(): "musicbrainz_albumartistid": "013c8e5b-d72a-4cd3-8dee-6c64d6125823;5b4d7d2d-36df-4b38-95e3-a964234f520f", "license": "Dummy license: http://creativecommons.org/licenses/by-sa/4.0/", "copyright": "Someone", + "genre": "Classical", } assert data.all() == expected @@ -249,6 +250,7 @@ def test_metadata_fallback_ogg_theora(mocker): "mbid": uuid.UUID("f269d497-1cc0-4ae4-a0c4-157ec7d73fcb"), "license": "https://creativecommons.org/licenses/by-nc-nd/2.5/", "copyright": "Someone", + "tags": ["Funk"], }, ), ( @@ -281,6 +283,7 @@ def test_metadata_fallback_ogg_theora(mocker): "mbid": uuid.UUID("bd21ac48-46d8-4e78-925f-d9cc2a294656"), "license": "Dummy license: http://creativecommons.org/licenses/by-sa/4.0/", "copyright": "Someone", + "tags": ["Classical"], }, ), ( @@ -313,6 +316,7 @@ def test_metadata_fallback_ogg_theora(mocker): "mbid": uuid.UUID("bd21ac48-46d8-4e78-925f-d9cc2a294656"), "license": "Dummy license: http://creativecommons.org/licenses/by-sa/4.0/", "copyright": "Someone", + "tags": ["Classical"], }, ), ( @@ -336,6 +340,7 @@ def test_metadata_fallback_ogg_theora(mocker): } ], }, + "tags": ["Rock"], "position": 1, "disc_number": 1, "mbid": uuid.UUID("124d0150-8627-46bc-bc14-789a3bc960c8"), @@ -371,6 +376,7 @@ def test_metadata_fallback_ogg_theora(mocker): "mbid": uuid.UUID("30f3f33e-8d0c-4e69-8539-cbd701d18f28"), "license": "http://creativecommons.org/licenses/by-nc-sa/3.0/us/", "copyright": "2008 nin", + "tags": ["Industrial"], }, ), ], @@ -607,3 +613,43 @@ def test_artist_field_featuring(): value = field.get_value(data) assert field.to_internal_value(value) == expected + + +@pytest.mark.parametrize( + "genre, expected_tags", + [ + ("Pop", ["Pop"]), + ("pop", ["pop"]), + ("Pop-Rock", ["PopRock"]), + ("Pop - Rock", ["Pop", "Rock"]), + ("Soundtrack - Cute Anime", ["Soundtrack", "CuteAnime"]), + ("Pop, Rock", ["Pop", "Rock"]), + ("Chanson française", ["ChansonFrançaise"]), + ("Unhandled❤️", []), + ("tag with non-breaking spaces", []), + ], +) +def test_acquire_tags_from_genre(genre, expected_tags): + data = { + "title": "Track Title", + "artist": "Track Artist", + "album": "Track Album", + "genre": genre, + } + expected = { + "title": "Track Title", + "artists": [{"name": "Track Artist", "mbid": None}], + "album": { + "title": "Track Album", + "mbid": None, + "release_date": None, + "artists": [], + }, + "cover_data": None, + } + if expected_tags: + expected["tags"] = expected_tags + + serializer = metadata.TrackMetadataSerializer(data=metadata.FakeMetadata(data)) + assert serializer.is_valid(raise_exception=True) is True + assert serializer.validated_data == expected diff --git a/api/tests/music/test_models.py b/api/tests/music/test_models.py index 5aa29b3cc..344273673 100644 --- a/api/tests/music/test_models.py +++ b/api/tests/music/test_models.py @@ -448,7 +448,7 @@ def test_get_audio_data(factories): result = upload.get_audio_data() - assert result == {"duration": 1, "bitrate": 112000, "size": 14858} + assert result == {"duration": 1, "bitrate": 112000, "size": 15918} def test_library_queryset_with_follows(factories): diff --git a/api/tests/music/test_tasks.py b/api/tests/music/test_tasks.py index 78f4622ba..08beaa94e 100644 --- a/api/tests/music/test_tasks.py +++ b/api/tests/music/test_tasks.py @@ -18,6 +18,7 @@ DATA_DIR = os.path.dirname(os.path.abspath(__file__)) def test_can_create_track_from_file_metadata_no_mbid(db, mocker): + add_tags = mocker.patch("funkwhale_api.tags.models.add_tags") metadata = { "title": "Test track", "artists": [{"name": "Test artist"}], @@ -26,6 +27,7 @@ def test_can_create_track_from_file_metadata_no_mbid(db, mocker): "disc_number": 2, "license": "Hello world: http://creativecommons.org/licenses/by-sa/4.0/", "copyright": "2018 Someone", + "tags": ["Punk", "Rock"], } match_license = mocker.spy(licenses, "match") @@ -44,6 +46,7 @@ def test_can_create_track_from_file_metadata_no_mbid(db, mocker): assert track.artist.mbid is None assert track.artist.attributed_to is None match_license.assert_called_once_with(metadata["license"], metadata["copyright"]) + add_tags.assert_called_once_with(track, *metadata["tags"]) def test_can_create_track_from_file_metadata_attributed_to(factories, mocker): diff --git a/api/tests/music/with_other_picture.mp3 b/api/tests/music/with_other_picture.mp3 index 3118f067e37064f91f48f4265fac9745236169e0..e83b610b09126c26384fa48428584c8fe255f5bf 100644 GIT binary patch delta 1888 zcmey|&HlNEz24Ksn3;irfvrL=#52T*fq{XSfr0V=KPQG%hCGH81_p+Z09Qi>1_nu} za3(_@!xM%gh6096h8%``h7ypHAZKp|1_o9J1_mR26NV5+A14L|21BrpNQQidQU(PE zXNE+EJO*`!5(Wi^V1^Qge1-xB1qPRThC~Jh1~-OMhCGIBkWm5t!BC?N^$i(9oc;aa zZbDM!;ur!KHefJkFkk>VJtWdK2rg{IV8CF&U^dOvR({{U|EpY^FdkzTtNoo^?Wecbi7I;A|l{H7{cJs5X2D4z`*eT zpAkbn+>sD9vQS4lGXybsG6XPyy!rp14?`kD2}2&pLB6go3=9nNU}Y{0nGER+nG7Wi zi3~Xm3JksssSGI$nGA^_g~7qDaD^cKWejNyrVJJgdJKjPMqmT{ z$`WjqFGDFqF+(OpGJ_LC5kn%_mMR7X21kY*h9rhkhFk`PdWH~&N`?YZQegQ1FO{JH zVTLEMW`NT(D1tz?f&7!tkj9V(wigr^IS3>DF^sIo;VyV0WFaq0c!jQrMvJd2n z6b1u^6b4-e0|rnim@%X=B!iu8$&kum$Y6{J`*s4ZfG0KvM+S5kfzv;#lRz;-p^G~4 zx(GRG)q~QOJ3|pT*?>x9Sb|ArFlPXTwGl%SLn1>OIBldrla4Wi8G|uHGB~cx87vqK z7<9qO3KS})44{OT2G#=#DR}vU-A(nNoDa&A$c_TJDFmEjlNqv+60sqJ1w#@;GJ^$! zC4(8*R!~%e(y1QW=aG5*bVw5*a`Rgdsy3xIzFG6OI9%&J2v3 z>KK?97@A))Z-2?mXtH&?#vaDgoa`Zv?oO$BDbowLF-mbcW#&C8D#*;qF9GpIlp^y> m6`T|E)JqhCOY#d8ToM)BO7lP!7Xt_w85o#>)X6e2u>t@Y)_NQO delta 1056 zcmey|!~VIOz24Ksn1z9XfvrL=#52SQ#A0Azc1q1lVF(FuHDq95;05tB^A0a5D#*;q zFJTA?a`t9mU|?lnU^db>VF+>babjR#kYHe7j?6Dra8ArqFHs0C$uCfFNmOtv&C6y8 z3GfewYB1C{WC(Eya%Ny)-~s73FsL`vH89dOFlPvH_77lSVBln6U=GgDP0i0sWe5of z1sM$z@=DB2%}dE=2#JV@U|?Y2W?*0r@ehh*FfuSO1347ra14hAgH6N`h8O`74t5Tz z_Y4SO@JTGmV+iqebzxv&;Adc9cF9c7EJ@5!@J&t0Ok@ZN4t9k~_#~F4nOf)>8tEAt z7{DAO1aeHUzY&8IawtJ;5oTau&M(SL&&*5AsZ32Qf;t!?DZ;?OT<`1{;^^b=?icFo zf!VRBB(u1r7@N`33=GV^rNx=aPDP2Cc~uIIIZ36t3L%vRsSK$F zFf){~Dhe)1EGaE!$WKeltWVBN%z>#^!>T$cH77N(I90(pzcjC;sFEQf9H!m~;RG%~Z)wKOubOtZ8|F)=iPhmj4$6oet*@B+ET zGTAICB_$mb1%v-EdUvoY;I&?l$4mJo0MX1S+8qsW}K{>Xl`Mkn`UlkY-(bjmY8Ol z3JWG&riBzGCTD|8Gqgxbwy-oa&`mZ=HP$t@Fan2Lif*b=qDi8mp<$YVnF)hqfTuG9 zTS^@R19N6>VtT56L0&oogL8gaYBMwYc4l@))venbb~3JDn%=OJF?qYj9>&w00Lt1E AbN~PV