Major rewrite due to Telegram topics

This commit is contained in:
Matthias Jacob 2023-06-28 05:47:20 +02:00
parent e0ceb3a968
commit 486a769aaf
1 changed file with 97 additions and 32 deletions

129
main.py
View File

@ -4,6 +4,7 @@ import re
from enum import Enum
from pprint import pprint
from urllib.parse import urlparse
import json
import pandas as pd
@ -32,6 +33,42 @@ SERVICES = {
'soundcloud.com': MusicSource.SOUNDCLOUD
}
class Link:
    """A link taken from a chat message, plus the id of the message it replies to.

    The music service behind the link is derived from the URL's hostname the
    first time :meth:`source` is called and is cached in ``_source`` afterwards.
    """

    # Matches "bandcamp.com" itself or a single subdomain label in front of
    # it. The dot is escaped and the pattern is applied with fullmatch() so
    # look-alike hosts ("bandcampxcom", "bandcamp.com.example.org") are not
    # classified as Bandcamp.
    _BANDCAMP_HOST = re.compile(r'([A-Za-z0-9\-]*\.)?bandcamp\.com')

    def __init__(self, link, reply_to_message_id):
        """Store the raw link text and the id of the replied-to message.

        Args:
            link: The URL text of the link.
            reply_to_message_id: Id of the message this link's message replies
                to, or ``None`` when it is not a reply.
        """
        self.link = link
        self.reply_to_message_id = reply_to_message_id
        self._source = None  # resolved lazily by source()

    def __str__(self):
        return f"Source: {self.source()}, Link: {self.link}, Reply to Message ID: {self.reply_to_message_id}"

    def __repr__(self):
        return f"Link(link={repr(self.link)}, reply_to_message_id={repr(self.reply_to_message_id)})"

    def source(self):
        """Return the ``MusicSource`` for this link, computing it once.

        Bandcamp is detected by hostname pattern; every other host is looked
        up in ``SERVICES`` and falls back to ``MusicSource.OTHER``.
        """
        if self._source is None:
            # urlparse() reports hostname=None when the link has no "//"
            # network location (e.g. "open.spotify.com/track/..."); guard
            # against it instead of handing None to the regex, which would
            # raise TypeError.
            hostname = urlparse(self.link).hostname
            if hostname is not None and self._BANDCAMP_HOST.fullmatch(hostname):
                self._source = MusicSource.BANDCAMP
            else:
                self._source = SERVICES.get(hostname, MusicSource.OTHER)
        return self._source

    @staticmethod
    def filter_links(links, music_source=None, reply_to_message_id=None):
        """Return the links that match every filter that was given.

        Args:
            links: Iterable of ``Link`` objects.
            music_source: Keep only links whose ``source()`` equals this
                value; ``None`` disables the filter.
            reply_to_message_id: Keep only links with this
                ``reply_to_message_id``; ``None`` disables the filter.

        Returns:
            ``links`` itself when no filter is given, otherwise a new list.
        """
        filtered_links = links
        if music_source is not None:
            filtered_links = [link for link in filtered_links if link.source() == music_source]
        if reply_to_message_id is not None:
            filtered_links = [link for link in filtered_links if link.reply_to_message_id == reply_to_message_id]
        return filtered_links
def _split_seq(iterable, size):
it = iter(iterable)
item = list(itertools.islice(it, size))
@ -39,43 +76,62 @@ def _split_seq(iterable, size):
yield item
item = list(itertools.islice(it, size))
def echo(link):
o = urlparse(link)
def print_filtered_messages(filtered_messages):
# Print the filtered messages
for message in filtered_messages:
reply_to_message_id = message["reply_to_message_id"]
link = message["link"]
print("Reply to Message ID:", reply_to_message_id)
print("Link:", link)
print()
if re.match(r'([A-Za-z0-9\-]*\.)?bandcamp.com', o.hostname):
return {'source': MusicSource.BANDCAMP, 'link': link}
print(len(filtered_messages))
return {'source': SERVICES.get(o.hostname, MusicSource.OTHER), 'link': link}
def extract_all_links(file_path):
    """Collect every link entity from the messages of a JSON chat export.

    The file is read as UTF-8 JSON and must contain a top-level ``"messages"``
    list. Only entries whose ``"type"`` is ``"message"`` are inspected; within
    their ``"text"`` value, each dict item with ``"type" == "link"`` yields one
    result.

    Args:
        file_path: Path of the exported JSON file.

    Returns:
        A list of dicts with the keys ``"reply_to_message_id"`` (``None`` when
        the message has no such field) and ``"link"`` (the entity's
        ``"text"`` value), in file order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        export = json.load(handle)

    collected = []
    for msg in export["messages"]:
        # Anything that is not a regular message is ignored.
        if msg["type"] != "message":
            continue
        reply_id = msg.get("reply_to_message_id")
        # Non-dict items in "text" (plain string fragments) are skipped.
        collected.extend(
            {"reply_to_message_id": reply_id, "link": part.get("text")}
            for part in msg.get("text", [])
            if isinstance(part, dict) and part.get("type") == "link"
        )
    return collected
def update_spotify_from_export(messages):
links_w_source = [Link(link=row['link'], reply_to_message_id=row['reply_to_message_id'])
for row in messages]
def update_spotify_from_export():
df = pd.read_json("ChatExport_2022-09-18/result.json")
df1 = pd.json_normalize(df.messages)
reduced = df1[df1['type'] == 'message'][['id', 'type', 'text', 'from', 'from_id']]
get_links = pd.json_normalize(reduced.explode('text').text)
links = get_links[get_links['type'] == 'link']['text'].to_list()
links_w_source = [echo(l) for l in links]
pprint(links_w_source)
for l in links_w_source:
print(l)
spotify_links = []
yt_links = []
soundcloud_links = []
bandcamp = []
other_links = []
#yt_links = []
#soundcloud_links = []
#bandcamp = []
#other_links = []
for i in links_w_source:
if i['source'] == MusicSource.SPOTIFY:
spotify_links.append(i['link'])
elif i['source'] == MusicSource.YOUTUBE:
yt_links.append(i['link'])
elif i['source'] == MusicSource.SOUNDCLOUD:
soundcloud_links.append(i['link'])
elif i['source'] == MusicSource.BANDCAMP:
bandcamp.append(i['link'])
else:
other_links.append(i['link'])
# for i in links_w_source:
# if i.source() == MusicSource.SPOTIFY:
# spotify_links.append(i.link)
# elif i.source() == MusicSource.YOUTUBE:
# yt_links.append(i.link)
# elif i.source() == MusicSource.SOUNDCLOUD:
# soundcloud_links.append(i.link)
# elif i.source() == MusicSource.BANDCAMP:
# bandcamp.append(i.link)
# else:
# other_links.append(i.link)
spotify_links = [l.link for l in filter(lambda x: x.source() == MusicSource.SPOTIFY and x.reply_to_message_id is None, links_w_source)]
#print(spotify_links)
#print(yt_links)
@ -85,14 +141,21 @@ def update_spotify_from_export():
print(f'Spotify tracks: {len(spotify_links)}')
# clean playlist itself from spotify links
spotify_links = [s for s in spotify_links if not s.startswith('https://open.spotify.com/playlist/')]
# support for links with language codes
spotify_links = [s.split("/intl-")[0] + "/track" + s.split("/track")[1] if ("open.spotify.com/intl-" in s and "track" in s) else s for s in spotify_links]
print(sorted(spotify_links))
scope = "playlist-modify-private"
os.environ['SPOTIPY_REDIRECT_URI'] = 'http://127.0.0.1:9090'
sp = spotipy.Spotify(auth_manager=SpotifyOAuth(scope=scope))
sp.playlist(SPOTIFY_PLAYLIST_ID)
# clean playlist itself from spotify links
spotify_links = [s for s in spotify_links if not s.startswith('https://open.spotify.com/playlist/')]
# paginated update of spotify playlist
for i, sublist in enumerate(_split_seq(spotify_links, 100)):
@ -103,4 +166,6 @@ def update_spotify_from_export():
if __name__ == '__main__':
update_spotify_from_export()
filtered_messages = extract_all_links("ChatExport_2023-06-28/result.json")
update_spotify_from_export(filtered_messages)