From 62c9028212f8c96d5faa99ea0061a011a4237249 Mon Sep 17 00:00:00 2001 From: Angel Rey Date: Mon, 21 Sep 2020 11:50:26 -0500 Subject: [PATCH] Improved tags --- archivebox.egg-info/requires.txt | 1 + archivebox/core/admin.py | 12 ++- .../migrations/0006_auto_20200915_2006.py | 89 ++++++++++++++++++ archivebox/core/models.py | 11 ++- archivebox/core/settings.py | 1 + archivebox/index/__init__.py | 11 ++- archivebox/index/schema.py | 3 +- archivebox/index/sql.py | 9 +- setup.py | 1 + tests/tags_migration/index.sqlite3 | Bin 0 -> 167936 bytes tests/test_init.py | 44 ++++++++- 11 files changed, 172 insertions(+), 10 deletions(-) create mode 100644 archivebox/core/migrations/0006_auto_20200915_2006.py create mode 100755 tests/tags_migration/index.sqlite3 diff --git a/archivebox.egg-info/requires.txt b/archivebox.egg-info/requires.txt index 71dc253d..ca279875 100644 --- a/archivebox.egg-info/requires.txt +++ b/archivebox.egg-info/requires.txt @@ -4,6 +4,7 @@ mypy-extensions==0.4.3 base32-crockford==0.3.0 django==3.0.8 django-extensions==3.0.3 +django-taggit==1.3.0 dateparser ipython youtube-dl diff --git a/archivebox/core/admin.py b/archivebox/core/admin.py index 4337e4a3..a35d589b 100644 --- a/archivebox/core/admin.py +++ b/archivebox/core/admin.py @@ -66,6 +66,12 @@ class SnapshotAdmin(admin.ModelAdmin): actions = [delete_snapshots, overwrite_snapshots, update_snapshots, update_titles, verify_snapshots] actions_template = 'admin/actions_as_select.html' + def get_queryset(self, request): + return super().get_queryset(request).prefetch_related('tags') + + def tag_list(self, obj): + return u", ".join(o.name for o in obj.tags.all()) + def id_str(self, obj): return format_html( '{}', @@ -75,9 +81,9 @@ class SnapshotAdmin(admin.ModelAdmin): def title_str(self, obj): canon = obj.as_link().canonical_outputs() tags = ''.join( - format_html('{}', tag.strip()) - for tag in obj.tags.split(',') - ) if obj.tags else '' + format_html(' {} ', tag) + for tag in obj.tags.all() + ) if obj.tags.all() else '' return format_html( '' '' diff --git a/archivebox/core/migrations/0006_auto_20200915_2006.py b/archivebox/core/migrations/0006_auto_20200915_2006.py new file mode 100644 index 00000000..59bb111e --- /dev/null +++ b/archivebox/core/migrations/0006_auto_20200915_2006.py @@ -0,0 +1,89 @@ +# Generated by Django 3.0.8 on 2020-09-15 20:06 + +from django.db import migrations, models +from django.contrib.contenttypes.models import ContentType +from django.utils.text import slugify +import django.db.models.deletion +import taggit.managers + +def forwards_func(apps, schema_editor): + SnapshotModel = apps.get_model("core", "Snapshot") + TaggedItemModel = apps.get_model("core", "TaggedItem") + TagModel = apps.get_model("taggit", "Tag") + contents = ContentType.objects.all() + try: + ct = ContentType.objects.filter(app_label="core", model="snapshot") + except model.DoesNotExist: # Be explicit about exceptions + ct = None + + db_alias = schema_editor.connection.alias + snapshots = SnapshotModel.objects.all() + for snapshot in snapshots: + tags = snapshot.tags + tag_set = ( + set(tag.strip() for tag in (snapshot.tags_old or '').split(',')) + ) + tag_list = list(tag_set) or [] + + for tag in tag_list: + new_tag, created = TagModel.objects.get_or_create(name=tag, slug=slugify(tag)) + TaggedItemModel.objects.get_or_create( + content_type_id=ct[0].id, + object_id=snapshot.id, + tag=new_tag + ) + + +def reverse_func(apps, schema_editor): + SnapshotModel = apps.get_model("core", "Snapshot") + TaggedItemModel = apps.get_model("core", "TaggedItem") + TagModel = apps.get_model("taggit", "Tag") + ct = ContentType.objects.get(app_label="core", model="snapshot") + + db_alias = schema_editor.connection.alias + snapshots = SnapshotModel.objects.all() + for snapshot in snapshots: + for tag in tags: + tagged_items = TaggedItemModel.objects.filter( + object_id=snapshot.id, + ).delete() + + +class Migration(migrations.Migration): + + dependencies = [ + ('contenttypes', '0002_remove_content_type_name'), + ('taggit', '0003_taggeditem_add_unique_index'), + ('core', '0005_auto_20200728_0326'), + ] + + operations = [ + migrations.RenameField( + model_name='snapshot', + old_name='tags', + new_name='tags_old', + ), + migrations.CreateModel( + name='TaggedItem', + fields=[ + ('id', models.AutoField(auto_created=True, primary_key=True, serialize=False, verbose_name='ID')), + ('object_id', models.UUIDField(db_index=True, verbose_name='object ID')), + ('content_type', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='core_taggeditem_tagged_items', to='contenttypes.ContentType', verbose_name='content type')), + ('tag', models.ForeignKey(on_delete=django.db.models.deletion.CASCADE, related_name='core_taggeditem_items', to='taggit.Tag')), + ], + options={ + 'verbose_name': 'Tag', + 'verbose_name_plural': 'Tags', + }, + ), + migrations.AddField( + model_name='snapshot', + name='tags', + field=taggit.managers.TaggableManager(help_text='A comma-separated list of tags.', through='core.TaggedItem', to='taggit.Tag', verbose_name='Tags'), + ), + migrations.RunPython(forwards_func, reverse_func), + migrations.RemoveField( + model_name='snapshot', + name='tags_old', + ), + ] diff --git a/archivebox/core/models.py b/archivebox/core/models.py index 313dd67d..b7719b2e 100644 --- a/archivebox/core/models.py +++ b/archivebox/core/models.py @@ -5,10 +5,19 @@ import uuid from django.db import models from django.utils.functional import cached_property +from taggit.managers import TaggableManager +from taggit.models import GenericUUIDTaggedItemBase, TaggedItemBase + from ..util import parse_date from ..index.schema import Link + +class TaggedItem(GenericUUIDTaggedItemBase, TaggedItemBase): + class Meta: + verbose_name = "Tag" + verbose_name_plural = "Tags" + class Snapshot(models.Model): id = models.UUIDField(primary_key=True, default=uuid.uuid4, editable=False) @@ -16,7 +25,7 @@ class Snapshot(models.Model): timestamp = models.CharField(max_length=32, unique=True, db_index=True) title = models.CharField(max_length=128, null=True, blank=True, db_index=True) - tags = models.CharField(max_length=256, null=True, blank=True, db_index=True) + tags = TaggableManager(through=TaggedItem) added = models.DateTimeField(auto_now_add=True, db_index=True) updated = models.DateTimeField(null=True, blank=True, db_index=True) diff --git a/archivebox/core/settings.py b/archivebox/core/settings.py index 14b3b369..6ae2b6af 100644 --- a/archivebox/core/settings.py +++ b/archivebox/core/settings.py @@ -31,6 +31,7 @@ INSTALLED_APPS = [ 'core', 'django_extensions', + 'taggit', ] diff --git a/archivebox/index/__init__.py b/archivebox/index/__init__.py index 06832dbc..f93a4ab8 100644 --- a/archivebox/index/__init__.py +++ b/archivebox/index/__init__.py @@ -86,9 +86,16 @@ def merge_links(a: Link, b: Link) -> Link: ) # all unique, truthy tags + tags_a = [] + if a.tags: + tags_a = a.tags.all() + tags_b = [] + if b.tags: + tags_b = b.tags.all() + tags_set = ( - set(tag.strip() for tag in (a.tags or '').split(',')) - | set(tag.strip() for tag in (b.tags or '').split(',')) + set(tag.name.strip() for tag in tags_a) + | set(tag.name.strip() for tag in tags_b) ) tags = ','.join(tags_set) or None diff --git a/archivebox/index/schema.py b/archivebox/index/schema.py index 7508890d..7ed44e74 100644 --- a/archivebox/index/schema.py +++ b/archivebox/index/schema.py @@ -157,7 +157,8 @@ class Link: assert isinstance(self.url, str) and '://' in self.url assert self.updated is None or isinstance(self.updated, datetime) assert self.title is None or (isinstance(self.title, str) and self.title) - assert self.tags is None or isinstance(self.tags, str) + #for tag in self.tags.all(): + # assert tag is None or isinstance(tag, TaggedItem) assert isinstance(self.sources, list) assert all(isinstance(source, str) and source for source in self.sources) assert isinstance(self.history, dict) diff --git a/archivebox/index/sql.py b/archivebox/index/sql.py index b3ca7231..bd3664da 100644 --- a/archivebox/index/sql.py +++ b/archivebox/index/sql.py @@ -65,7 +65,14 @@ def write_sql_link_details(link: Link, out_dir: Path=OUTPUT_DIR) -> None: except Snapshot.DoesNotExist: snap = write_link_to_sql_index(link) snap.title = link.title - snap.tags = link.tags + + tag_set = ( + set(tag.strip() for tag in (link.tags or '').split(',')) + ) + tag_list = list(tag_set) or [] + + for tag in tag_list: + snap.tags.add(tag) snap.save() diff --git a/setup.py b/setup.py index db83e9bf..0272f565 100755 --- a/setup.py +++ b/setup.py @@ -80,6 +80,7 @@ setuptools.setup( "base32-crockford==0.3.0", "django==3.0.8", "django-extensions==3.0.3", + "django-taggit==1.3.0", "dateparser", "ipython", diff --git a/tests/tags_migration/index.sqlite3 b/tests/tags_migration/index.sqlite3 new file mode 100755 index 0000000000000000000000000000000000000000..04d35a71e68e8460936ae8f525bcfc169e53e967 GIT binary patch literal 167936 zcmWFz^vNtqRY=P(%1ta$FlG>7U}R))P*7lCVA#&Uz@Q1mj0_A6%?u0-EDQ($5<~E? zv$iwn$!uU{U|?Y6{|!DQiIJs+l?bw46J))H#_@)xrbbxx8k<^}8d?b> z>orEwYi4W^Z)jj_f?cnnsga455VBq)WW7eF@rI^GhFJ9)TAEr|=nJwkh^oT;i$8!2 zO-w9|9RyezMBN<|OG`2g3=9q9OAAsGOH$(tit;Ne;|o%Yax)>pjonH^b0Z^XepUui zUziC7@rgMlsYUVWMfs%#@p*~4sqwjq74bQ#dFdq?SPd~SGch!E=VN6M4TTzFU0=pq%{l%)I!#(j2Ur z!NlCs*v!a*6Dh6@Opwh083YcS{DKnf9x*pEH!^VMK#FSvV`LLja}zUj2*kCCnW>?H zkq0{~gJ?L+D@IU55NVB=$Tl%GH!w8vU}I%ab$3J#Mo=im7p3Orm%)QK9u&S{L$St? zsiCo&CUCxo zWMljVqluxJfvJ%>BP)X{sGvZkKtd&|iGhi+g)uMx6$Uo`B1Zm~{44p3`1kYA;_n_- z^~|VeMnhmU1V%$(Gz3ONU^E0qLtr!nMnhmU1V%$(Gz3ONU_e4ZoR!&_F(oT8FFhYN zVgw%2VCUs!R^$M6`5~iiJUq2b=nOPU64Lq&^8i-&LW@FZ71S>)exX3Ut zFoK$Ez|LREz~9Y(kN++I1^&(amHhKCbdL&*hQMeD zjE2By2#kinXb6mkz-S1JhQMeDjE2By2#kinXb6xK0)i}}vW%dGOvQPL1;rWpCG0{h z;G(m0?j;W<*{jfy8FvWf2tx4+ww;4Vd{@M8&~8@cJDl zAr?_Z@Q?yxumH6FpPj#gf&UHvUHK&1uX~w_5XR9SQtdbK|FB(Ux<-~K~xdMMfCqc^ZzouXBhb3 z@?YcM&%cU)7JoZ`34bzw2)`4*0lxx27vD#|8+-@(*7MEcYvn8Ai|6y?Gv<@$8G%+&ID z*rc+IC<~)DBYR3}PHIUiTvQq&nw*iCmyQsX5@KOgWn>4PgaFen2{w`qY$R+_S^}(@ z4QeDzR2(7-H4-K&1~!rnWF$;T6fDFFHWJ#S6%k=!)MR9Zng|sXh6q9pgbE4?vM?$$ zvVzQmiU^9aFzPb0fQ>`y2Mb8BFd8zlK#fF_;D<;+jYX2+1ABl4WHgc}FFy;TJR>vM zcuSg~S|%;>?oF zd}c9bW6sdzjMB11g^<+b34(=Q>L}!urj{iZDD%*~!7_*+~VKVSb6Jp85fVrGtBa(*s`YC{V>GfQIwV^h%n ze@6cA4E*1RPRNeBbTkA;Ltr!nMnhmU1V%$(Gz3ONU^E0qLtr!nMnhmU1V%%Es1V>_ z7Uu+y5rF3ZN7w%o6=tJ~MnhmU1V%$(Gz3ONU^E0qLtr!nMnhmU1V%$(Gz3ONV918R zX#ao6M%}2xM?+vV1V%$(Gz3ONU^E0qLtr!nMnhmU1V%$(Gz3ONfankat^b$dk7eNh z!~dQC6aQQO=ll=(Z}VT}KhJ-X|1kev{_Xr5`B(EV<)6<#lYcUQFMm6KBY!o2DStkH zCVw)2EYTK?DjW@g(GVC7fzc2c4S~@R7!85Z5Eu=C(GVC7fzc2c4T0er0<6r8jO;RC zQW{K3fk{aa$tD3N#lfT)m=pz(tRi4i7)%O*NkI_FA^;}&!6YA;7`~Sl+R!99b8UmvsFd71*Aut*OqaiRF0;3@?8UmvsFd71*Aut*O zs39=g|3?jvQQl|>jE2By2#kinXb6mkz-S1JhQMeDjE2By2#kinXb23)5E$+M564&? z_0wnwjE2By2#kinXb6mkz-S1JhQMeDjE2By2#kinXb7N&z-a#;H9SUnqaiRF0;3@? z8UmvsFd71*Aut*OqaiRF0;3@?8UmvsFdRdGm;EaPKjU!*eign;oLAWn@s{&+v5ByT zvrOgw&S}r#%5B4Zg7G+$KjS2Z3klAD%2=erG42o3nc8yeU3=Q%3^m7h!^>y_N(NR!JOfJdH&xyDJhgC7A0pS7HJq67-%BvP0mQnOHYl@ zO)V}?Oixu(fEWOEmX3l_a(-S(YFcO&hWiy+1Jt{k z3N9dZLEtd)^z#hyboB9wgbKR4=qMr6n2hAf-f`3k|xooWyh` zh2q@AoSe+O5{1&d;>`5C)D(C)C^&n#I(sW!S2UUEClrhvhy(%p+y1AL?p*!XaRWv;UC3bfU66J0T!w5MUra?)w$$%SN^tA|si$1U`z;OarQjE^$ zfw~7;^r1_Gf&)}S!OIU=4#!vYfvv-;i?VV7>;q7gV3x!f27}c?T~DAS)8tlT2%u&) z)mSddA>PR8$k?QhNHO3*0TYPig~+-HF;MDEU>GZ{5aUfEiD;a%n?}vB8zZ7Bx7KRazfpMoINlU zfo#K?-0@`+h$&dLQIeQ3;)IC2f(QkuPY7ig1_lN(%nZ|LA;vDA?8w*{i5B_b2t-(d zNcD-9rm2Z3hDnH2fUZ;$8vYpR0bLjaF&EaoLTG@77G_C>FGdmz3Xu9orlu(UBamBh%O(~S zeyT~8H5D0T%F0~$qY*~No385=FZ1G3OzuHsbiL~?v; zML}j!YCLGBKR(ek+04ky%otUP26PwfoB8Dbmse1;}8sCf9QYA=D5wHf0lc zS7c-W2OV^1F|o8HAIy(OI5pl7!In2PW)pXn$Eyi6+6vVJVsbZY8nKJZ%QLnqB0>Ws z0X7xH;KC9XAaSySq9C!jxIDio1u@oWXk>vh5}A`&Tmo8hl9`7*+!*TPqobgdSsY(n zT98@=LDm6xm6deVn)ZE0(9E2l{OifT6335nD zVp^J#0=gqIi{nA7G0IXgrNAjYD?c+2>qx4Of)d#4sLn!3ESilf>g?jN+9(t2h=KZe z zoLJGcU|5ixpOOmlb-aOLVv@1BX(D0{2H`9=sG}kI1!WNiLJ^Th6YUf5!XSv_lr)=E zRoOuk11*e*jDoNl&69Fa=OPy%2*sdKBe@ho4`$5Kf8?1JXc>sC32H0hITl!iAkx1f z+HxvTbRlGokS7x~8$DFm#q*6B8>_*wkHsPJi3J7mIf+TBIq|vqDXBT}=4MGL#wo^0 z;E2bffdv|xsF9CFA%2q~@s33sC}iMn0?pEcou#DN=&8&uZfwlhWC@OPEIP3Wu|gdS zje0C9K-Q5Q{RsEL7rCLw3dGger8OG`6p&|Zq0WM@@Pp5|CYzd>q$VYrp(~JqdJ9@= zpi5)uM+t1WH6X8pN^kHogGLQ`cJX*!#>PP8lmb_anVU_Ll2a{GlhEy!Cg3#WOaLku zlcDC|b|RV%B~6~~43?<%Gm$*YQZ{j4bzC)aa(+>2d~sf4L2*WYNxTsXSJkqFc&&yg zT9upRirK~0)ftmu66Sf32F(n08omyG| zs&-MPE<=5MG#MBeM9^cf(X5(XTvL;=NgW($&{PI(uA=f(P=gIs5|qZFy5h4_D-ljM zL26^5g&|CRN@7VO=IT`B<}2pXSWW91HgSJVTxkr|aq)(ze6>cFDt2*Cb;d?ja3(}a z0rAk#i#Ik-F-?r35xX`_^e0K zhQoG{8YRuTa@?LT$t=l9jW;nhPcbzzLe1vvcx^|~hs%19A|*}BGU7`bW0U|;AD#b4 zFGfcBqaiRF0;3@?8UmvsFd71*Aut*OqaiRF0;3@?8UmvsF#N!1|9|+!>8OuJLtr!n zMnhmU1V%$(Gz3ONU^E0qLtr!nMnhmU1V%#uJp@Mk|L7qy${!7Z(GVC7fzc2c4S~@R z7!85Z5Eu=C(GVC7fzc2c4T0eo0-*js6aQBR{@?sxhoA38eKZ;ZqaiRF0;3@?8Umvs zFd71*Aut*OqaiRF0;3@?8UiCY1X!6F8QGawm>D_QA^rdF4E%rizmMP$8TI;T2#kin zXb6mkz-S1JhQMeDjE2By2#kinXb6mkz-R~z#}Hs)=451IX6EGNWnf@n;%i{wH{oj- zjvgKL(`X2chQMeDjE2By2#kinXb6mkz-S1JhQMeDjE2DA2!TdJCN2iY&_>ZwGG~L8Ba|>M)3j+&X3!|h|-L&LH zQ$rIYV{?-<(`Z980|R3-GZRa5Jp)r?a|_d0g^ZGtf?_Lu{qpj1z0%~2(z3*o)Z`4k zQrlg=KOH4_Dt5KBz zxz3Oc`Xn`RNK^5lwNBrKW5Ux1cD52q7sIgD8bK21O}E2%%J2 z6y!jCHjo2z^3%aKf$Ro35N@@S2uP7GE673d(0lwqia>k?C|^yOm4Q*ykQL$pH>J$W785otdSwRkiS^=^W{T!pRyB8s9xlNTK5EFcF%Z3bBkat+AQ zP$95mWO!H^80FQOL9PVfDhg5r;wXSF4pra=`QurjpJvoJKY zG%#Y8Wp?C5-B@90Vr6QsXJBS(VPOX8|3l9IxCc6K z4|Lof@@adxPu9aYh7Zd@drC=}=?W={C5cKp3duS7r71cJC8^1I`8oONhu0?RDCCu< zmL(P|BUK9kvHLYY%kP9_XY!bLpItnSNC5bsXsp#%Unww`}U}z>cs;H!oi+?)< z6aORz{z?3I`0r53&{1PXLtr!nMnhmU1V%$(Gz3ONU^E0qLtr!nMnhmU1V%$(Gz18Q zfC{rOBcag@W(EcZRRZQPFeo$oauOQG;RTr@z`(#adXhgVBu4RQ2#kinXb6mkz-S1J zhQMeDjE2By2#kinXb6mkz#t9*Mtr;D2eG>cgVP}U|4%URpWuJQ|7b9{Xw(g(Aut*O zqaiRF0;3@?8UmvsFd71*Aut*OqaiRF0;3@?8Um&`fi>i1 z|3}yUV}|3X$Y=