gardening

This commit is contained in:
Czar 2025-04-09 21:31:37 -03:00
commit cecf31720c
2 changed files with 291 additions and 235 deletions

351
angel.py
View file

@ -1,37 +1,7 @@
import requests
import bs4
import yt_dlp as youtube_dl
import random
import re
import os
import asyncio import asyncio
from collections import defaultdict from collections import defaultdict
from slixmpp import ClientXMPP from slixmpp import ClientXMPP
from urllib.parse import urlparse, parse_qs, urlunparse from slixmpp.stanza import Message
from pantomime import normalize_mimetype
import cgi
import ipaddress
import io
# Parser backend name handed to BeautifulSoup.
parser = "html.parser"
# The User-Agent is written as two adjacent literals. The parentheses are
# required: without them the second literal is a separate, no-op expression
# statement and the header is sent without its "Gecko/... Firefox/..." part.
user_agent = (
    "Mozilla/5.0 (X11; Linux x86_64; rv:10.0)"
    " Gecko/20100101 Firefox/10.0"
)
accept_lang = "en-US"
data_limit = 100000000  # 100MB
# Request headers sent with every outgoing HTTP request.
headers = {
    "user-agent": user_agent,
    "Accept-Language": accept_lang,
    "Cache-Control": "no-cache",
}
# Host of YouTube short links; compared against a parsed uri's netloc.
youtube_link = "youtu.be"
# Shared yt-dlp client, created once when the module is imported.
ydl = youtube_dl.YoutubeDL()
# Scheme prefixes used to recognise URLs inside a message body.
req_list = ("http://", "https://")
# Content types that are previewed by page title instead of being re-uploaded.
html_files = ("text/html", "application/xhtml+xml")
class Lifo(list): class Lifo(list):
"""Limited size LIFO array to store messages and urls.""" """Limited size LIFO array to store messages and urls."""
@ -47,30 +17,24 @@ class Lifo(list):
if len(self) > self.size: if len(self) > self.size:
self.pop() self.pop()
def create_messages_dict():
    """Build the per-sender history store.

    Returns a ``defaultdict`` whose missing keys are filled with a fresh dict
    of three size-limited ``Lifo`` buffers: 100 ``"messages"``, 10 ``"links"``
    and 10 ``"previews"``.
    """

    def _new_history():
        # One independent set of buffers per sender.
        return {
            "messages": Lifo(100),
            "links": Lifo(10),
            "previews": Lifo(10),
        }

    return defaultdict(_new_history)
def get_youtube_title(url):
    """Return the title yt-dlp reports for ``url``, or ``None`` on failure.

    Only metadata is requested (``download=False``). Any exception is printed
    and swallowed, so a bad link yields ``None`` instead of raising.
    """
    try:
        metadata = ydl.extract_info(url, download=False)
        title = metadata["title"]
    except Exception as exc:
        print(exc)
        title = None
    return title
def get_yurl(path):
    """Build a ``https://youtu.be/<id>`` link from a video id or url path.

    ``path`` may be a bare video id or a url path such as ``"/abc"``; leading
    slashes are removed so the result never contains ``youtu.be//``.
    """
    video = path.lstrip("/")
    return f"https://youtu.be/{video}"
# decorator to define a regex command
class RegexCmd: class RegexCmd:
"""Regex command decorator.""" """Regex command decorator."""
def __init__(self, bot, pattern): def __init__(self, bot, pattern, block=False, matcher=None):
"""Initialize the decorator.""" """Initialize the decorator."""
self.pattern = pattern self.pattern = pattern
self.bot = bot self.bot = bot
self.block = block
self.matcher = matcher
def __call__(self, func): def __call__(self, func):
"""Call the decorator.""" """Call the decorator."""
@ -81,126 +45,66 @@ class RegexCmd:
class AngelBot(ClientXMPP): class AngelBot(ClientXMPP):
"""AngelBot class.""" """AngelBot class."""
messages = defaultdict( def __init__(self, jid, password, nick="angel", autojoin=None,
lambda: { youtube_links=None,
"messages": Lifo(100), invidious_instances=None):
"links": Lifo(10), """Initialize the bot."""
"previews": Lifo(10), super().__init__(jid, password)
} self.jid = jid
self.nick = nick
self.autojoin = autojoin or []
self.invidious_instances = invidious_instances or []
self.youtube_links = youtube_links or []
self.messages = create_messages_dict()
self.register_plugins()
self.add_handlers()
def reply(self, msg, body):
    """Record ``msg`` in its sender's history, then answer it with ``body``."""
    # History is saved first so the incoming message is stored before sending.
    self.save_message_history(msg)
    self.raw_reply(msg, body)
def raw_reply(self, msg, body):
"""Reply to a message without saving history."""
self.send_message(
mto=msg["from"].bare,
mbody=body,
mtype=msg["type"],
) )
def save_message_history(self, msg):
    """Store the body of ``msg`` in the message buffer of its bare sender."""
    history = self.messages[msg["from"].bare]
    history["messages"].add(msg["body"])
def get_message_history(self, msg):
    """Return the stored message buffer for the bare sender of ``msg``."""
    return self.messages[msg["from"].bare]["messages"]
def save_link_history(self, msg, url):
    """Store ``url`` in the link buffer of the bare sender of ``msg``."""
    history = self.messages[msg["from"].bare]
    history["links"].add(url)
def get_link_history(self, msg):
    """Return the stored link buffer for the bare sender of ``msg``."""
    return self.messages[msg["from"].bare]["links"]
def save_preview_history(self, msg, preview):
    """Store ``preview`` in the preview buffer of the bare sender of ``msg``."""
    history = self.messages[msg["from"].bare]
    history["previews"].add(preview)
def get_preview_history(self, msg):
    """Return the stored preview buffer for the bare sender of ``msg``."""
    return self.messages[msg["from"].bare]["previews"]
regex_cmds = [] regex_cmds = []
def get_urls(self, msg): async def embed_file(self, sender, mtype, ftype, fname, outfile):
"""Get urls from a message."""
str_list = msg["body"].strip().split()
urls = [u for u in str_list if any(r in u for r in req_list)]
return urls
def get_invidious_link(self, yurl):
    """Rewrite a youtube link as a watch link on a random invidious instance.

    The video id is whatever follows the last "/" of ``yurl``; the host is
    picked at random from ``self.invidious_instances``.
    """
    video_id = yurl.rsplit("/", 1)[-1]
    host = random.choice(self.invidious_instances)
    return f"https://{host}/watch?v={video_id}"
def send_youtube_info(self, uri, sender, mtype):
"""Send a title preview and an invidious link for a YouTube uri to ``sender``.

NOTE(review): this view of the file has lost its indentation, so the exact
nesting below (in particular which ``send_message`` calls sit inside
``if output``) cannot be confirmed from here.
"""
yurl = None
# Short links (netloc equal to ``youtube_link``) carry the video id in the path.
if uri.netloc == youtube_link:
yurl = get_yurl(uri.path)
# Otherwise the id is taken from the first value of the "v" query parameter.
elif "v" in (query := parse_qs(uri.query)):
if v := query["v"]:
yurl = get_yurl(v[0])
else:
# Neither form matched: nothing is sent.
return
invidious = self.get_invidious_link(yurl)
# The title is looked up through the invidious link, not the original uri.
if output := get_youtube_title(invidious):
# A title already present in this sender's previews is not repeated.
if output in self.messages[sender]["previews"]:
return
self.messages[sender]["previews"].add(output)
self.send_message(mto=sender, mbody=f"*{output}*", mtype=mtype)
self.send_message(mto=sender, mbody=invidious, mtype=mtype)
async def parse_uri(self, uri, sender, mtype):
    """Route a parsed uri to the YouTube handler or the generic previewer.

    When invidious instances are configured and the uri's netloc is a known
    YouTube host, the uri is handed to ``send_youtube_info``. Uris whose host
    is a literal private IP address (IPv4 or IPv6) are ignored. Everything
    else is passed to ``process_link``.
    """
    netloc = uri.netloc
    if self.invidious_instances and netloc in (self.youtube_links + [youtube_link]):
        self.send_youtube_info(uri, sender, mtype)
        return

    try:
        # ``hostname`` drops the port and the brackets of an IPv6 literal;
        # splitting ``netloc`` on ":" failed on hosts such as "[::1]:80" and
        # let them through the filter.
        if ipaddress.ip_address(uri.hostname or "").is_private:
            return
    except ValueError:
        # Not an IP literal (e.g. a regular host name): nothing to filter.
        pass

    await self.process_link(uri, sender, mtype)
async def process_link(self, uri, sender, mtype):
    """Fetch ``uri`` and send a preview of it to ``sender``.

    HTML responses are answered with the page ``<title>`` (preceded by the
    final URL when the request was redirected). Any other content type is
    downloaded, up to ``data_limit`` bytes, and re-uploaded via
    ``embed_file``. A title already present in the sender's preview history
    is not sent again. Errors in the file branch are printed, not raised.
    """
    import codecs  # local import: the file's import list is left untouched

    url = urlunparse(uri)
    # With stream=True the connection stays open until the body is consumed,
    # so the response is closed explicitly on every exit path.
    with requests.get(url, stream=True, headers=headers, timeout=6) as r:
        if not r.ok:
            return
        ftype = normalize_mimetype(r.headers.get("content-type"))
        if not ftype:
            return

        if ftype in html_files:
            # Incremental decoding keeps multibyte characters that straddle
            # a chunk boundary instead of silently dropping them.
            decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")
            data = ""
            for chunk in r.iter_content(chunk_size=1024, decode_unicode=False):
                data += decoder.decode(chunk)
                # The title lives in <head>; stop reading once it is closed.
                if len(data) > data_limit or "</head>" in data.lower():
                    break
            title = bs4.BeautifulSoup(data, parser).find("title")
            output = title.text.strip() if title else ""
            if not output:
                return
            if "\n" not in output:
                output = f"*{output}*"
            previews = self.messages[sender]["previews"]
            if output in previews:
                return
            previews.add(output)
            if r.history:
                self.send_message(mto=sender, mbody=r.url, mtype=mtype)
            self.send_message(mto=sender, mbody=output, mtype=mtype)
            return

        try:
            length = 0
            outfile = io.BytesIO()
            for chunk in r.iter_content(chunk_size=512, decode_unicode=False):
                # Count the bytes actually received, not the requested size.
                length += len(chunk)
                if length >= data_limit:
                    return
                outfile.write(chunk)

            filename = None
            content_disposition = r.headers.get("content-disposition")
            if content_disposition:
                # NOTE(review): ``cgi`` is removed in Python 3.13 (PEP 594).
                _, params = cgi.parse_header(content_disposition)
                filename = params.get("filename")
                if params.get("filename*"):
                    # RFC 5987 form: charset'lang'value -> keep the value.
                    filename = params["filename*"].split("''")[-1]
            if not filename:
                # No usable header name: fall back to the last path segment.
                filename = os.path.basename(uri.path)
            fname = filename or "file.txt"
            await self.embed_file(url, sender, mtype, ftype, fname, outfile)
        except Exception as e:
            print(e)
async def embed_file(self, url, sender, mtype, ftype, fname, outfile):
"""Embed a file and send the result to the sender.""" """Embed a file and send the result to the sender."""
furl = await self.plugin["xep_0363"].upload_file( furl = await self.plugin["xep_0363"].upload_file(
fname, content_type=ftype, input_file=outfile fname, content_type=ftype, input_file=outfile
@ -212,32 +116,7 @@ class AngelBot(ClientXMPP):
message["oob"]["url"] = furl message["oob"]["url"] = furl
message.send() message.send()
async def parse_urls(self, msg, urls, sender, mtype): def register_plugins(self):
"""Parse urls and send the result to the sender."""
body = msg["body"].lower()
if "nsfw" in body: return
if "nsfl" in body: return
for u in urls:
if u in self.messages[sender]["links"]:
continue
else:
self.messages[sender]["links"].add(u)
uri = urlparse(u)
await self.parse_uri(uri, sender, mtype)
def __init__(self, jid, password, nick="angel", autojoin=None,
youtube_links=None,
invidious_instances=None):
"""Initialize the bot."""
ClientXMPP.__init__(self, jid, password)
self.jid = jid
self.nick = nick
self.autojoin = autojoin or []
self.invidious_instances = invidious_instances or []
self.youtube_links = youtube_links or []
self.register_plugin("xep_0030") self.register_plugin("xep_0030")
self.register_plugin("xep_0060") self.register_plugin("xep_0060")
self.register_plugin("xep_0054") self.register_plugin("xep_0054")
@ -247,6 +126,7 @@ class AngelBot(ClientXMPP):
self.register_plugin("xep_0153") self.register_plugin("xep_0153")
self.register_plugin("xep_0363") self.register_plugin("xep_0363")
def add_handlers(self):
self.add_event_handler("session_start", self.session_start) self.add_event_handler("session_start", self.session_start)
self.add_event_handler("message", self.message) self.add_event_handler("message", self.message)
self.add_event_handler("groupchat_message", self.muc_message) self.add_event_handler("groupchat_message", self.muc_message)
@ -310,14 +190,6 @@ class AngelBot(ClientXMPP):
mtype = msg["type"] mtype = msg["type"]
sender = msg["from"].bare sender = msg["from"].bare
try:
if not msg["oob"]["url"]:
if urls := self.get_urls(msg):
await self.parse_urls(msg, urls, sender, mtype)
except Exception as e:
print(e)
self.process_commands(msg, sender, mtype) self.process_commands(msg, sender, mtype)
async def muc_message(self, msg): async def muc_message(self, msg):
@ -334,14 +206,6 @@ class AngelBot(ClientXMPP):
mtype = msg["type"] mtype = msg["type"]
sender = msg["from"].bare sender = msg["from"].bare
try:
if not msg["oob"]["url"]:
if urls := self.get_urls(msg):
await self.parse_urls(msg, urls, sender, mtype)
except Exception as e:
print(e)
self.process_commands(msg, sender, mtype) self.process_commands(msg, sender, mtype)
@ -349,5 +213,78 @@ class AngelBot(ClientXMPP):
"""Process commands.""" """Process commands."""
for cmd in self.regex_cmds: for cmd in self.regex_cmds:
if cmd.pattern.match(msg["body"]): if cmd.pattern.match(msg["body"]):
return cmd.func(self, msg, sender, mtype) ctx = CommandContext(self, msg)
if cmd.matcher and not cmd.matcher(ctx):
continue
cmd.func(ctx)
if(cmd.block):
return
self.messages[sender]["messages"].add(msg["body"]) self.messages[sender]["messages"].add(msg["body"])
class CommandContext:
    """Bundle the bot and one incoming message for a command handler.

    Handlers receive a single ``CommandContext`` and use it to read the
    message (``body``, ``sender``, ``mtype``), to answer it, and to reach the
    per-sender histories kept by the bot.
    """

    def __init__(self, bot: "AngelBot", msg: "Message"):
        """Initialize the command context."""
        self.bot = bot
        self.msg = msg

    def reply(self, body):
        """Answer the message with ``body`` through ``bot.reply``."""
        return self.bot.reply(self.msg, body)

    def raw_reply(self, body):
        """Answer the message with ``body`` through ``bot.raw_reply``.

        This is a plain method: as a property it could never receive
        ``body`` and every ``ctx.raw_reply(...)`` call raised ``TypeError``.
        """
        return self.bot.raw_reply(self.msg, body)

    @property
    def sender(self):
        """Get the bare address of the message sender."""
        return self.msg["from"].bare

    @property
    def mtype(self):
        """Get the message type."""
        return self.msg["type"]

    @property
    def body(self):
        """Get the message body."""
        return self.msg["body"]

    @property
    def is_oob(self):
        """Check whether the message carries an OOB url."""
        return bool(self.msg["oob"]["url"])

    @property
    def message_history(self):
        """Get the message history of the sender."""
        return self.bot.get_message_history(self.msg)

    @property
    def link_history(self):
        """Get the link history of the sender."""
        return self.bot.get_link_history(self.msg)

    @property
    def preview_history(self):
        """Get the preview history of the sender."""
        return self.bot.get_preview_history(self.msg)

    def save_message_history(self):
        """Save the message in the sender's message history."""
        self.bot.save_message_history(self.msg)

    def save_link_history(self, url):
        """Save ``url`` in the sender's link history."""
        self.bot.save_link_history(self.msg, url)

    def save_preview_history(self, preview):
        """Save ``preview`` in the sender's preview history."""
        self.bot.save_preview_history(self.msg, preview)

    def embed_file(self, ftype, fname, outfile):
        """Schedule ``bot.embed_file`` for this message's sender.

        The coroutine is scheduled, not awaited, so synchronous handlers can
        call this. The scheduled future is returned for callers that want to
        await it or inspect its outcome.
        """
        return asyncio.ensure_future(
            self.bot.embed_file(self.sender, self.mtype, ftype, fname, outfile)
        )

177
main.py
View file

@ -1,8 +1,15 @@
from angel import AngelBot, RegexCmd from angel import AngelBot, RegexCmd, CommandContext
from configparser import ConfigParser from configparser import ConfigParser
from PythonSed import Sed from PythonSed import Sed
import re import re
import io import io
from urllib.parse import urlparse, parse_qs, urlunparse
from pantomime import normalize_mimetype
import cgi
import ipaddress
import bs4
import requests
import os
sed_parse = re.compile("(?<!\\\\)[/#]") sed_parse = re.compile("(?<!\\\\)[/#]")
sed_cmd = re.compile("^s[/#].*[/#].*[/#]") sed_cmd = re.compile("^s[/#].*[/#].*[/#]")
@ -23,46 +30,158 @@ bot = AngelBot(jid, password, nick=nick, autojoin=autojoin,
youtube_links=youtube_links, youtube_links=youtube_links,
invidious_instances=invidious_instances) invidious_instances=invidious_instances)
@RegexCmd(bot, sed_cmd)
def sed_command(bot, msg, sender, mtype): def default_matcher(ctx: CommandContext) -> bool:
if ctx.is_oob:
return False
body = ctx.body.lower()
return "nsfw" not in body and "nsfl" not in body
@RegexCmd(bot, sed_cmd, block=True)
def sed_command(ctx: CommandContext):
"""Process sed command.""" """Process sed command."""
try: try:
text = msg["body"] text = ctx.body
if not sed_cmd.match(text):
bot.messages[sender]["messages"].add(text)
return
sed_args = sed_parse.split(text) sed_args = sed_parse.split(text)
if len(sed_args) < 4:
return
sed = Sed() sed = Sed()
sed.load_string(text) sed.load_string(text)
pattern = re.compile(sed_args[1])
for message in bot.messages[sender]["messages"]: for history_message in ctx.message_history:
if not re.search(sed_args[1], message): if not pattern.search(history_message):
continue continue
msg = io.StringIO(message) msg = io.StringIO(history_message)
res = "\n".join(sed.apply(msg, None)) response = "\n".join(sed.apply(msg, None))
bot.messages[sender]["messages"].add(res) return ctx.reply(response)
return bot.send_message(
mto=sender,
mbody=res,
mtype=mtype,
)
except Exception as e: except Exception as e:
print(e) print(e)
# ping command
@RegexCmd(bot, re.compile(r"^ping$")) @RegexCmd(bot, re.compile(r"^ping$"))
def ping_command(bot, msg, sender, mtype): def ping_command(ctx: CommandContext):
"""Process ping command.""" """Process ping command."""
bot.send_message( ctx.reply("pong")
mto=sender,
mbody="pong",
mtype=mtype,
)
@RegexCmd(bot, re.compile(r"^https?://"), matcher=default_matcher)
def url_command(ctx: CommandContext):
    """Preview every url found in a message that starts with a link."""
    if urls := get_urls(ctx.body):
        parse_urls(ctx, urls)
# URL parsing
# Scheme prefixes used to recognise URLs inside a message body.
req_list = ("http://", "https://")
# Content types that are previewed by page title instead of being re-uploaded.
html_files = ("text/html", "application/xhtml+xml")
# Parser backend name handed to BeautifulSoup.
parser = "html.parser"
# The User-Agent is written as two adjacent literals. The parentheses are
# required: without them the second literal is a separate, no-op expression
# statement and the header is sent without its "Gecko/... Firefox/..." part.
user_agent = (
    "Mozilla/5.0 (X11; Linux x86_64; rv:10.0)"
    " Gecko/20100101 Firefox/10.0"
)
accept_lang = "en-US"
data_limit = 100000000  # 100MB
# Request headers sent with every outgoing HTTP request.
headers = {
    "user-agent": user_agent,
    "Accept-Language": accept_lang,
    "Cache-Control": "no-cache",
}
def get_urls(body):
    """Return the whitespace-separated tokens of ``body`` that are URLs.

    A token counts only when it starts with one of the prefixes in
    ``req_list``. Tokens that merely contain a scheme somewhere inside (for
    example ``"(http://x)"``) are skipped, because they are not fetchable
    URLs as written and would fail when requested.
    """
    return [token for token in body.split() if token.startswith(req_list)]
def is_private(uri):
    """Tell whether ``uri`` addresses a literal private IP (IPv4 or IPv6).

    ``uri`` is a parsed url (as returned by ``urlparse``). Host names that
    are not IP literals are reported as not private.
    """
    try:
        # ``hostname`` drops the port and the brackets of an IPv6 literal;
        # splitting ``netloc`` on ":" failed on hosts such as "[::1]:80".
        return ipaddress.ip_address(uri.hostname or "").is_private
    except ValueError:
        # Not an IP literal (e.g. a regular host name).
        return False
def preview_page(ctx: CommandContext, r):
    """Reply with the ``<title>`` of the HTML response ``r``.

    The body is read in chunks until ``</head>`` has been seen or more than
    ``data_limit`` characters were collected. Single-line titles are wrapped
    in ``*``. A title already in the sender's preview history is not sent
    again. When the request was redirected, the final url is sent first.
    """
    import codecs  # local import: the file's import list is left untouched

    # Incremental decoding keeps multibyte characters that straddle a chunk
    # boundary; decoding each chunk on its own silently dropped them.
    decoder = codecs.getincrementaldecoder("utf-8")(errors="ignore")
    data = ""
    for chunk in r.iter_content(chunk_size=1024, decode_unicode=False):
        data += decoder.decode(chunk)
        if len(data) > data_limit or "</head>" in data.lower():
            break

    title = bs4.BeautifulSoup(data, parser).find("title")
    if not title:
        return
    output = title.text.strip()
    if not output:
        return
    if "\n" not in output:
        output = f"*{output}*"
    if output in ctx.preview_history:
        return
    ctx.save_preview_history(output)
    if r.history:
        ctx.raw_reply(r.url)
    ctx.reply(output)
def preview_file(ctx: CommandContext, uri, ftype, r):
    """Download the non-HTML response ``r`` and re-upload it for the sender.

    The body is buffered in memory; the download is abandoned once
    ``data_limit`` bytes have been received. The upload name comes from the
    Content-Disposition header when it names a file, otherwise from the last
    path segment of ``uri``, otherwise ``"file.txt"``. Errors are printed,
    not raised.
    """
    try:
        length = 0
        outfile = io.BytesIO()
        for chunk in r.iter_content(chunk_size=512, decode_unicode=False):
            # Count the bytes actually received, not the requested size.
            length += len(chunk)
            if length >= data_limit:
                return
            outfile.write(chunk)

        filename = None
        content_disposition = r.headers.get("content-disposition")
        if content_disposition:
            # NOTE(review): ``cgi`` is removed in Python 3.13 (PEP 594).
            _, params = cgi.parse_header(content_disposition)
            filename = params.get("filename")
            if params.get("filename*"):
                # RFC 5987 form: charset'lang'value -> keep the value.
                filename = params["filename*"].split("''")[-1]
        if not filename:
            # No usable header name: fall back to the last path segment.
            filename = os.path.basename(uri.path)
        fname = filename or "file.txt"
        ctx.embed_file(ftype, fname, outfile)
    except Exception as e:
        print(e)
def process_link(ctx: CommandContext, uri):
    """Fetch ``uri`` and preview it: page title for HTML, upload otherwise.

    Nothing is sent when the request fails, the status is not ok, or the
    content type cannot be determined. Network errors are printed instead of
    propagating into the message handler.
    """
    url = urlunparse(uri)
    try:
        # With stream=True the connection stays open until the body is
        # consumed; the ``with`` block closes it on every exit path.
        with requests.get(url, stream=True, headers=headers, timeout=6) as r:
            if not r.ok:
                return
            ftype = normalize_mimetype(r.headers.get("content-type"))
            if not ftype:
                return
            if ftype in html_files:
                preview_page(ctx, r)
            else:
                preview_file(ctx, uri, ftype, r)
    except requests.RequestException as e:
        print(e)
def parse_urls(ctx: CommandContext, urls):
    """Preview each url in ``urls`` not yet seen from this sender.

    Every new url is recorded in the sender's link history before it is
    examined; urls that ``is_private`` flags are recorded but not fetched.
    """
    for candidate in urls:
        if candidate not in ctx.link_history:
            ctx.save_link_history(candidate)
            parsed = urlparse(candidate)
            if not is_private(parsed):
                process_link(ctx, parsed)
bot.connect() bot.connect()
bot.process(forever=True) bot.process(forever=True)