major rewrite due to telegram topics

2023-06-28 05:47:20 +02:00
parent e0ceb3a968
commit 486a769aaf
1 changed files with 97 additions and 32 deletions
@@ -4,6 +4,7 @@ import re
 from enum import Enum
 from pprint import pprint
 from urllib.parse import urlparse
+import json

 import pandas as pd

@@ -32,6 +33,42 @@ SERVICES = {
    'soundcloud.com': MusicSource.SOUNDCLOUD
 }

+class Link:
+    """A link found in a chat message, plus the id of the message it replied to.
+
+    The music service behind the link is resolved lazily by ``source()`` and
+    cached on the instance, so repeated filtering does not re-parse the URL.
+    """
+
+    def __init__(self, link, reply_to_message_id):
+        """Store the raw link text and the replied-to message id (may be None)."""
+        self.link = link
+        self.reply_to_message_id = reply_to_message_id
+        # Resolved on the first call to source().
+        self._source = None
+
+    def __str__(self):
+        return f"Source: {self.source()}, Link: {self.link}, Reply to Message ID: {self.reply_to_message_id}"
+
+    def __repr__(self):
+        return f"Link(link={repr(self.link)}, reply_to_message_id={repr(self.reply_to_message_id)})"
+
+    def source(self):
+        """Return the MusicSource this link points to (cached after first call).
+
+        Bandcamp is matched by pattern because artists live on subdomains;
+        every other service is looked up by exact hostname in SERVICES.
+        Links whose hostname cannot be determined are reported as
+        MusicSource.OTHER.
+        """
+        if self._source is None:
+            # urlparse() reports hostname=None for links without a scheme
+            # (e.g. "youtu.be/xyz"); fall back to '' so the regex and the
+            # dict lookup below do not choke on None.
+            hostname = urlparse(self.link).hostname or ''
+
+            # fullmatch + escaped dot: only "bandcamp.com" itself or a single
+            # subdomain of it counts, not look-alikes such as
+            # "bandcamp.community" or "bandcampXcom".
+            if re.fullmatch(r'(?:[A-Za-z0-9\-]+\.)?bandcamp\.com', hostname):
+                self._source = MusicSource.BANDCAMP
+            else:
+                self._source = SERVICES.get(hostname, MusicSource.OTHER)
+
+        return self._source
+
+    @staticmethod
+    def filter_links(links, music_source=None, reply_to_message_id=None):
+        """Return the links matching every criterion that is not None.
+
+        :param links: iterable of Link objects
+        :param music_source: keep only links whose source() equals this value
+        :param reply_to_message_id: keep only links replying to this message id
+        :return: the input unchanged when no criterion is given, else a new list
+        """
+        filtered_links = links
+
+        if music_source is not None:
+            filtered_links = [link for link in filtered_links if link.source() == music_source]
+
+        if reply_to_message_id is not None:
+            filtered_links = [link for link in filtered_links if link.reply_to_message_id == reply_to_message_id]
+
+        return filtered_links
+
+
 def _split_seq(iterable, size):
    it = iter(iterable)
    item = list(itertools.islice(it, size))
@@ -39,43 +76,62 @@ def _split_seq(iterable, size):
        yield item
        item = list(itertools.islice(it, size))

-def echo(link):
-    o = urlparse(link)
+def print_filtered_messages(filtered_messages):
+    # Print the filtered messages
+    for message in filtered_messages:
+        reply_to_message_id = message["reply_to_message_id"]
+        link = message["link"]
+        print("Reply to Message ID:", reply_to_message_id)
+        print("Link:", link)
+        print()
    
-    if re.match(r'([A-Za-z0-9\-]*\.)?bandcamp.com', o.hostname):
-        return {'source': MusicSource.BANDCAMP, 'link': link}
+    print(len(filtered_messages))

-    return {'source': SERVICES.get(o.hostname, MusicSource.OTHER), 'link': link}
+def extract_all_links(file_path):
+    """Collect every link entity from the chat export at *file_path*.
+
+    The file is read as UTF-8 JSON with a top-level "messages" list. Only
+    entries whose "type" is "message" are inspected; within their "text"
+    field, each dict entry of type "link" produces one result.
+
+    :param file_path: path to the exported JSON file
+    :return: list of dicts with keys "reply_to_message_id" (None when the
+             message has no such field) and "link", in file order
+    """
+    with open(file_path, "r", encoding="utf-8") as export_file:
+        export = json.load(export_file)
+
+    found = []
+    for msg in export["messages"]:
+        if msg["type"] != "message":
+            continue
+
+        reply_id = msg.get("reply_to_message_id")
+        # "text" may also be a plain string; its characters are not dicts
+        # and are therefore skipped by the check below.
+        for entity in msg.get("text", []):
+            if not isinstance(entity, dict) or entity.get("type") != "link":
+                continue
+            found.append({"reply_to_message_id": reply_id, "link": entity.get("text")})
+
+    return found


+def update_spotify_from_export(messages):
+    links_w_source = [Link(link=row['link'], reply_to_message_id=row['reply_to_message_id'])
+                      for row in messages]

-def update_spotify_from_export():
-    df = pd.read_json("ChatExport_2022-09-18/result.json")
-    df1 = pd.json_normalize(df.messages)
-    reduced = df1[df1['type'] == 'message'][['id', 'type', 'text', 'from', 'from_id']]
-    get_links = pd.json_normalize(reduced.explode('text').text)
-    links = get_links[get_links['type'] == 'link']['text'].to_list()
-    links_w_source = [echo(l) for l in links]
-
-    pprint(links_w_source)
+    for l in links_w_source:
+        print(l)

    spotify_links = []
-    yt_links = []
-    soundcloud_links = []
-    bandcamp = []
-    other_links = []
+    #yt_links = []
+    #soundcloud_links = []
+    #bandcamp = []
+    #other_links = []

-    for i in links_w_source:
-        if i['source'] == MusicSource.SPOTIFY:
-            spotify_links.append(i['link'])
-        elif i['source'] == MusicSource.YOUTUBE:
-            yt_links.append(i['link'])
-        elif i['source'] == MusicSource.SOUNDCLOUD:
-            soundcloud_links.append(i['link'])
-        elif i['source'] == MusicSource.BANDCAMP:
-            bandcamp.append(i['link'])
-        else:
-            other_links.append(i['link'])
+    # for i in links_w_source:
+    #     if i.source() == MusicSource.SPOTIFY:
+    #         spotify_links.append(i.link)
+    #     elif i.source() == MusicSource.YOUTUBE:
+    #         yt_links.append(i.link)
+    #     elif i.source() == MusicSource.SOUNDCLOUD:
+    #         soundcloud_links.append(i.link)
+    #     elif i.source() == MusicSource.BANDCAMP:
+    #         bandcamp.append(i.link)
+    #     else:
+    #         other_links.append(i.link)
+
+    spotify_links = [l.link for l in filter(lambda x: x.source() == MusicSource.SPOTIFY and x.reply_to_message_id is None, links_w_source)]

    #print(spotify_links)
    #print(yt_links)
@@ -85,14 +141,21 @@ def update_spotify_from_export():

    print(f'Spotify tracks: {len(spotify_links)}')

+    # clean playlist itself from spotify links
+    spotify_links = [s for s in spotify_links if not s.startswith('https://open.spotify.com/playlist/')]
+
+    # support for links with language codes 
+    spotify_links = [s.split("/intl-")[0] + "/track" + s.split("/track")[1] if ("open.spotify.com/intl-" in s and "track" in s) else s for s in spotify_links]
+
+
+    print(sorted(spotify_links))
+
    scope = "playlist-modify-private"
    os.environ['SPOTIPY_REDIRECT_URI'] = 'http://127.0.0.1:9090'

    sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))
    sp.playlist(SPOTIFY_PLAYLIST_ID)
    
-    # clean playlist itself from spotify links
-    spotify_links = [s for s in spotify_links if not s.startswith('https://open.spotify.com/playlist/')]

    # paginated update of spotify playlist
    for i, sublist in enumerate(_split_seq(spotify_links, 100)):
@@ -103,4 +166,6 @@ def update_spotify_from_export():


 if __name__ == '__main__':
-    update_spotify_from_export()
+    filtered_messages = extract_all_links("ChatExport_2023-06-28/result.json")
+    update_spotify_from_export(filtered_messages)
+