Fixes bot

This commit is contained in:
2025-02-10 19:42:10 +01:00
parent 8b5b2bd887
commit 2604fc490e

93
bot.py
View File

@@ -1,14 +1,14 @@
# coding: utf-8 # coding: utf-8
from telegram.ext import Updater from telegram import Bot
from lxml import html from lxml import html
import requests import requests
from lxml.etree import tostring as htmlstring from lxml.etree import tostring as htmlstring
from urllib.parse import urljoin, urlparse
import time import time
import re import re
import asyncio
from dotenv import dotenv_values
from selenium import webdriver from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support.ui import WebDriverWait
@@ -16,32 +16,38 @@ from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException from selenium.common.exceptions import TimeoutException
linkTemplate = '/html[1]/body[1]/div[2]/div[4]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[2]/div[1]/div[1]/ol[1]/li[{index}]/div[1]/div[1]/h3[1]/a[1]' #linkTemplate = '/html[1]/body[1]/div[2]/div[4]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[2]/div[1]/div[1]/ol[1]/li[{index}]/div[1]/div[1]/h3[1]/a[1]'
linkTemplate = '/html[1]/body[1]/div[1]/div[4]/div[4]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[2]/div[1]/div[1]/ol[1]/li[{index}]/div[1]/div[1]/h3[1]/a[1]'
debug = False debug = False
countStrPath = '/home/nenas/projects/python/hardwareChecker/countStr.txt' if debug else '/home/nenas/telegram/hardwareChecker/countStr.txt' countStrPath = '/home/nenas/telegram/hardwareChecker/countStr.txt' if debug else '/home/nenas/telegram/hardwareChecker/countStr.txt'
def sendMessage(updater, channelID, message):
def sendMessage(bot, channelID, message):
if debug: if debug:
print(message) print(message)
else: else:
updater.bot.sendMessage(channelID, message) loop = asyncio.get_event_loop()
loop.run_until_complete(bot.sendMessage(channelID, message))
def getNewContent(): def getNewContent():
fireFoxOptions = webdriver.FirefoxOptions() fireFoxOptions = webdriver.FirefoxOptions()
fireFoxOptions.headless = True fireFoxOptions.add_argument("--headless")
browser = webdriver.Firefox(options=fireFoxOptions) browser = webdriver.Firefox(options=fireFoxOptions)
browser.get('https://www.hardwareluxx.de/community/members/hwl-news-bot.268095/#recent-content')
try: for i in range(0, 3):
WebDriverWait(browser, 120).until(EC.presence_of_element_located((By.XPATH, linkTemplate.format(index='1')))) browser.get('https://www.hardwareluxx.de/community/members/hwl-news-bot.268095/#recent-content')
return browser try:
except TimeoutException: WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, linkTemplate.format(index='1'))))
return -1 return browser
except TimeoutException:
if debug:
print("Timeout on WebDriverWait")
browser.quit()
return -1
def getLinksForID(browser, id): def getLinksForID(browser, id):
elements = browser.find_elements_by_xpath(linkTemplate.format(index=(str(id)))) elements = browser.find_elements("xpath", linkTemplate.format(index=(str(id))))
links = [elem.get_attribute('href') for elem in elements] links = [elem.get_attribute('href') for elem in elements]
if len(links) != 1: if len(links) != 1:
@@ -50,7 +56,7 @@ def getLinksForID(browser, id):
def checkForUpdate(browser): def checkForUpdate(browser):
xpathStr = '//dt[contains(text(),\'Beiträge\')]/../dd/a' xpathStr = '//dt[contains(text(),\'Beiträge\')]/../dd/a'
count = browser.find_elements_by_xpath(xpathStr) count = browser.find_elements("xpath", xpathStr)
if len(count) != 1: if len(count) != 1:
return -1 return -1
@@ -86,34 +92,31 @@ def end(ret, browser):
def main(): def main():
config = dotenv_values(".env") token = "1495297410:AAGCwqqUZVdcc6RjnyWtl7XtO0K2tz4EpDQ"
channelID = '-1001418260700'
bot = Bot(token)
token = config["TOKEN"]
channelID = config["CHANNELID"]
updater = Updater(token)
browser = getNewContent() browser = getNewContent()
if browser == -1: if browser == -1:
sendMessage(updater, channelID, 'Bot kaputt: Kann dynamic Content nicht laden') # sendMessage(bot, channelID, 'Bot kaputt: Kann dynamic Content nicht laden, Hardwareluxx-Server sind im Eimer')
return -1 return -1
try: try:
ret = checkForUpdate(browser) ret = checkForUpdate(browser)
if debug:
print(f'Updates: {ret}')
if ret == 0: if ret == 0:
return end(0, browser) return end(0, browser)
if (ret < 0): if (ret < 0):
sendMessage(updater, channelID, 'Bot kaputt: Kann static Content nicht laden') sendMessage(bot, channelID, 'Bot kaputt: Kann static Content nicht laden')
return end(-1, browser) return end(-1, browser)
links = [] links = []
for index in range(ret): for index in range(ret):
link = getLinksForID(browser, index + 1) link = getLinksForID(browser, index + 1)
if link == -1: if link == -1:
sendMessage(updater, channelID, 'Bot kaputt: Kann Links nicht laden') sendMessage(bot, channelID, 'Bot kaputt: Kann Links nicht laden')
return end(-1, browser) return end(-1, browser)
links.insert(0, link) links.insert(0, link)
@@ -121,7 +124,7 @@ def main():
linkStr = str(link) linkStr = str(link)
post = re.findall(r'post-[0-9]*$', linkStr) post = re.findall(r'post-[0-9]*$', linkStr)
if len(post) > 1: if len(post) > 1:
sendMessage(updater, channelID, 'Bot kaputt: Kann Post-ID nicht ermitteln: ' + linkStr) sendMessage(bot, channelID, 'Bot kaputt: Kann Post-ID nicht ermitteln: ' + linkStr)
continue continue
if len(post) == 1: if len(post) == 1:
postID = post[0][5:] postID = post[0][5:]
@@ -134,36 +137,42 @@ def main():
postElement = tree.xpath('//article[@id=\'js-post-'+ postID + '\']/div/div[2]/div/div/div/article/div[1]') postElement = tree.xpath('//article[@id=\'js-post-'+ postID + '\']/div/div[2]/div/div/div/article/div[1]')
if len(postElement) != 1: if len(postElement) != 1:
sendMessage(updater, channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln') sendMessage(bot, channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln')
continue continue
postElementStr = str(htmlstring(postElement[0], encoding='unicode')) postElementStr = str(htmlstring(postElement[0], encoding='unicode'))
match = re.search(r'.*Name: (.*)<br.*', postElementStr) match = re.search(r'.*Name: (.*)<br.*', postElementStr)
if match == None: if match == None:
sendMessage(updater, channelID, 'Bot kaputt: Kann Hardwarename nicht capturen: ' + postElementStr) sendMessage(bot, channelID, 'Bot kaputt: Kann Hardwarename nicht capturen: ' + postElementStr)
continue continue
hwName = match.group(1) hwName = match.group(1)
match = re.search(r'.*Shop: (.*)<br.*', postElementStr) match = re.search(r'.*Shop: (.*)<br.*', postElementStr)
if match == None: if match == None:
sendMessage(updater, channelID, 'Bot kaputt: Kann Shop nicht capturen: ' + postElementStr) sendMessage(bot, channelID, 'Bot kaputt: Kann Shop nicht capturen: ' + postElementStr)
continue continue
shop = match.group(1) shop = match.group(1)
match = re.search(r'.*Preis: (.*)<br.*', postElementStr) match = re.search(r'.*Preis: (.*)<br.*', postElementStr)
if match == None: if match == None:
sendMessage(updater, channelID, 'Bot kaputt: Kann Shop nicht capturen: ' + postElementStr) sendMessage(bot, channelID, 'Bot kaputt: Kann Shop nicht capturen: ' + postElementStr)
continue continue
preis = match.group(1) preis = match.group(1)
match = re.search(r'href\="([^"]+).*', postElementStr) match = re.search(r'href\="([^"]+).*', postElementStr)
if match == None: if match == None:
sendMessage(updater, channelID, 'Bot kaputt: Kann Hardwarename nicht capturen: ' + postElementStr) sendMessage(bot, channelID, 'Bot kaputt: Kann Hardwarename nicht capturen: ' + postElementStr)
continue continue
url = match.group(1) url = match.group(1)
r = requests.head(url, allow_redirects=True)
message = 'Name: ' + hwName + '\nPreis: ' + preis + '\nShop: ' + shop + '\nURL: ' + url clean_url = urljoin(r.url, urlparse(r.url).path)
sendMessage(updater, channelID, message) if "notebooksbilliger" in r.url:
clean_url = r.url.split("?")[0]
if debug:
message = 'Name: ' + hwName + '\nPreis: ' + preis + '\nShop: ' + shop + '\nURL: ' + url + '\nresolved: ' + r.url + '\nnoQuery: ' + clean_url
else:
message = 'Name: ' + hwName + '\nPreis: ' + preis + '\nShop: ' + shop + '\nURL: ' + clean_url
sendMessage(bot, channelID, message)
else: else:
page = requests.get(linkStr) page = requests.get(linkStr)
@@ -171,21 +180,21 @@ def main():
postElement = tree.xpath('//a[contains(text(), \'weiterlesen\')]') postElement = tree.xpath('//a[contains(text(), \'weiterlesen\')]')
if len(postElement) != 1: if len(postElement) != 1:
sendMessage(updater, channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln') sendMessage(bot, channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln')
return end(-1, browser) return end(-1, browser)
postElementStr = str(htmlstring(postElement[0], encoding='unicode')) postElementStr = str(htmlstring(postElement[0], encoding='unicode'))
match = re.search(r'href\="([^"]+).*', postElementStr) match = re.search(r'href\="([^"]+).*', postElementStr)
if match == None: if match == None:
sendMessage(updater, channelID, 'Bot kaputt: Kann Hardwarename nicht capturen.') sendMessage(bot, channelID, 'Bot kaputt: Kann Hardwarename nicht capturen.')
continue continue
url = match.group(1) url = match.group(1)
sendMessage(updater, channelID, 'News: ' + url) sendMessage(bot, channelID, 'News: ' + url)
return end(0, browser) return end(0, browser)
except Exception as e: except Exception as e:
browser.close() browser.quit()
sendMessage(updater, channelID, str(e)) sendMessage(bot, channelID, str(e))
if __name__ == "__main__": if __name__ == "__main__":
main() main()