Fixes bot
This commit is contained in:
93
bot.py
93
bot.py
@@ -1,14 +1,14 @@
|
|||||||
# coding: utf-8
|
# coding: utf-8
|
||||||
|
|
||||||
from telegram.ext import Updater
|
from telegram import Bot
|
||||||
from lxml import html
|
from lxml import html
|
||||||
import requests
|
import requests
|
||||||
from lxml.etree import tostring as htmlstring
|
from lxml.etree import tostring as htmlstring
|
||||||
|
from urllib.parse import urljoin, urlparse
|
||||||
|
|
||||||
import time
|
import time
|
||||||
import re
|
import re
|
||||||
|
import asyncio
|
||||||
from dotenv import dotenv_values
|
|
||||||
|
|
||||||
from selenium import webdriver
|
from selenium import webdriver
|
||||||
from selenium.webdriver.support.ui import WebDriverWait
|
from selenium.webdriver.support.ui import WebDriverWait
|
||||||
@@ -16,32 +16,38 @@ from selenium.webdriver.support import expected_conditions as EC
|
|||||||
from selenium.webdriver.common.by import By
|
from selenium.webdriver.common.by import By
|
||||||
from selenium.common.exceptions import TimeoutException
|
from selenium.common.exceptions import TimeoutException
|
||||||
|
|
||||||
# Absolute XPath of one link on the loaded profile page. ``{index}`` selects
# the ``li`` entry inside the ``ol`` list; callers pass a 1-based position.
linkTemplate = '/html[1]/body[1]/div[1]/div[4]/div[4]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[2]/div[1]/div[1]/ol[1]/li[{index}]/div[1]/div[1]/h3[1]/a[1]'

# When True, sendMessage() prints messages instead of sending them.
debug = False

# Both the debug and the non-debug branch pointed at this same file, so the
# conditional was redundant and is replaced by a plain assignment.
countStrPath = '/home/nenas/telegram/hardwareChecker/countStr.txt'
|
||||||
|
|
||||||
def sendMessage(bot, channelID, message):
    """Send ``message`` to a chat, or print it when ``debug`` is set.

    Args:
        bot: Object whose ``sendMessage(channelID, message)`` returns an
            awaitable (main() passes a ``telegram.Bot``).
        channelID: Chat/channel identifier handed through to the bot.
        message: Text to deliver.
    """
    if debug:
        print(message)
        return

    # asyncio.get_event_loop() is deprecated when no loop is current and
    # raises RuntimeError on Python 3.14+, so fall back to creating and
    # installing a loop explicitly.
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)

    # The same loop is reused across calls rather than using asyncio.run().
    # NOTE(review): presumably the bot's HTTP client should not outlive a
    # closed loop - confirm against the telegram library in use.
    loop.run_until_complete(bot.sendMessage(channelID, message))
|
||||||
|
|
||||||
def getNewContent():
    """Load the news-bot profile page in headless Firefox.

    The page is requested up to three times. Each attempt waits up to 20
    seconds for the element matched by ``linkTemplate`` with index 1 to be
    present.

    Returns:
        The running ``webdriver.Firefox`` instance on success (the caller is
        responsible for quitting it), or ``-1`` when every attempt timed out,
        in which case the browser has already been quit.

    Raises:
        Any non-timeout exception from the driver is re-raised after the
        browser has been quit.
    """
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.add_argument("--headless")
    browser = webdriver.Firefox(options=fireFoxOptions)

    try:
        for _ in range(3):
            browser.get('https://www.hardwareluxx.de/community/members/hwl-news-bot.268095/#recent-content')
            try:
                WebDriverWait(browser, 20).until(
                    EC.presence_of_element_located(
                        (By.XPATH, linkTemplate.format(index='1'))
                    )
                )
            except TimeoutException:
                if debug:
                    print("Timeout on WebDriverWait")
            else:
                return browser
    except Exception:
        # Do not leave an orphaned Firefox process behind when the driver
        # fails for a reason other than the wait timing out.
        browser.quit()
        raise

    # All attempts timed out.
    browser.quit()
    return -1
|
||||||
|
|
||||||
def getLinksForID(browser, id):
|
def getLinksForID(browser, id):
|
||||||
elements = browser.find_elements_by_xpath(linkTemplate.format(index=(str(id))))
|
elements = browser.find_elements("xpath", linkTemplate.format(index=(str(id))))
|
||||||
links = [elem.get_attribute('href') for elem in elements]
|
links = [elem.get_attribute('href') for elem in elements]
|
||||||
|
|
||||||
if len(links) != 1:
|
if len(links) != 1:
|
||||||
@@ -50,7 +56,7 @@ def getLinksForID(browser, id):
|
|||||||
|
|
||||||
def checkForUpdate(browser):
|
def checkForUpdate(browser):
|
||||||
xpathStr = '//dt[contains(text(),\'Beiträge\')]/../dd/a'
|
xpathStr = '//dt[contains(text(),\'Beiträge\')]/../dd/a'
|
||||||
count = browser.find_elements_by_xpath(xpathStr)
|
count = browser.find_elements("xpath", xpathStr)
|
||||||
|
|
||||||
if len(count) != 1:
|
if len(count) != 1:
|
||||||
return -1
|
return -1
|
||||||
@@ -86,34 +92,31 @@ def end(ret, browser):
|
|||||||
|
|
||||||
def main():
|
def main():
|
||||||
|
|
||||||
config = dotenv_values(".env")
|
token = "1495297410:AAGCwqqUZVdcc6RjnyWtl7XtO0K2tz4EpDQ"
|
||||||
|
channelID = '-1001418260700'
|
||||||
|
bot = Bot(token)
|
||||||
token = config["TOKEN"]
|
|
||||||
channelID = config["CHANNELID"]
|
|
||||||
|
|
||||||
updater = Updater(token)
|
|
||||||
|
|
||||||
browser = getNewContent()
|
browser = getNewContent()
|
||||||
if browser == -1:
|
if browser == -1:
|
||||||
sendMessage(updater, channelID, 'Bot kaputt: Kann dynamic Content nicht laden')
|
# sendMessage(bot, channelID, 'Bot kaputt: Kann dynamic Content nicht laden, Hardwareluxx-Server sind im Eimer')
|
||||||
return -1
|
return -1
|
||||||
try:
|
try:
|
||||||
ret = checkForUpdate(browser)
|
ret = checkForUpdate(browser)
|
||||||
|
if debug:
|
||||||
|
print(f'Updates: {ret}')
|
||||||
if ret == 0:
|
if ret == 0:
|
||||||
return end(0, browser)
|
return end(0, browser)
|
||||||
|
|
||||||
|
|
||||||
if (ret < 0):
|
if (ret < 0):
|
||||||
sendMessage(updater, channelID, 'Bot kaputt: Kann static Content nicht laden')
|
sendMessage(bot, channelID, 'Bot kaputt: Kann static Content nicht laden')
|
||||||
return end(-1, browser)
|
return end(-1, browser)
|
||||||
|
|
||||||
links = []
|
links = []
|
||||||
for index in range(ret):
|
for index in range(ret):
|
||||||
link = getLinksForID(browser, index + 1)
|
link = getLinksForID(browser, index + 1)
|
||||||
if link == -1:
|
if link == -1:
|
||||||
sendMessage(updater, channelID, 'Bot kaputt: Kann Links nicht laden')
|
sendMessage(bot, channelID, 'Bot kaputt: Kann Links nicht laden')
|
||||||
return end(-1, browser)
|
return end(-1, browser)
|
||||||
links.insert(0, link)
|
links.insert(0, link)
|
||||||
|
|
||||||
@@ -121,7 +124,7 @@ def main():
|
|||||||
linkStr = str(link)
|
linkStr = str(link)
|
||||||
post = re.findall(r'post-[0-9]*$', linkStr)
|
post = re.findall(r'post-[0-9]*$', linkStr)
|
||||||
if len(post) > 1:
|
if len(post) > 1:
|
||||||
sendMessage(updater, channelID, 'Bot kaputt: Kann Post-ID nicht ermitteln: ' + linkStr)
|
sendMessage(bot, channelID, 'Bot kaputt: Kann Post-ID nicht ermitteln: ' + linkStr)
|
||||||
continue
|
continue
|
||||||
if len(post) == 1:
|
if len(post) == 1:
|
||||||
postID = post[0][5:]
|
postID = post[0][5:]
|
||||||
@@ -134,36 +137,42 @@ def main():
|
|||||||
|
|
||||||
postElement = tree.xpath('//article[@id=\'js-post-'+ postID + '\']/div/div[2]/div/div/div/article/div[1]')
|
postElement = tree.xpath('//article[@id=\'js-post-'+ postID + '\']/div/div[2]/div/div/div/article/div[1]')
|
||||||
if len(postElement) != 1:
|
if len(postElement) != 1:
|
||||||
sendMessage(updater, channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln')
|
sendMessage(bot, channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln')
|
||||||
continue
|
continue
|
||||||
postElementStr = str(htmlstring(postElement[0], encoding='unicode'))
|
postElementStr = str(htmlstring(postElement[0], encoding='unicode'))
|
||||||
|
|
||||||
match = re.search(r'.*Name: (.*)<br.*', postElementStr)
|
match = re.search(r'.*Name: (.*)<br.*', postElementStr)
|
||||||
if match == None:
|
if match == None:
|
||||||
sendMessage(updater, channelID, 'Bot kaputt: Kann Hardwarename nicht capturen: ' + postElementStr)
|
sendMessage(bot, channelID, 'Bot kaputt: Kann Hardwarename nicht capturen: ' + postElementStr)
|
||||||
continue
|
continue
|
||||||
hwName = match.group(1)
|
hwName = match.group(1)
|
||||||
|
|
||||||
match = re.search(r'.*Shop: (.*)<br.*', postElementStr)
|
match = re.search(r'.*Shop: (.*)<br.*', postElementStr)
|
||||||
if match == None:
|
if match == None:
|
||||||
sendMessage(updater, channelID, 'Bot kaputt: Kann Shop nicht capturen: ' + postElementStr)
|
sendMessage(bot, channelID, 'Bot kaputt: Kann Shop nicht capturen: ' + postElementStr)
|
||||||
continue
|
continue
|
||||||
shop = match.group(1)
|
shop = match.group(1)
|
||||||
|
|
||||||
match = re.search(r'.*Preis: (.*)<br.*', postElementStr)
|
match = re.search(r'.*Preis: (.*)<br.*', postElementStr)
|
||||||
if match == None:
|
if match == None:
|
||||||
sendMessage(updater, channelID, 'Bot kaputt: Kann Shop nicht capturen: ' + postElementStr)
|
sendMessage(bot, channelID, 'Bot kaputt: Kann Shop nicht capturen: ' + postElementStr)
|
||||||
continue
|
continue
|
||||||
preis = match.group(1)
|
preis = match.group(1)
|
||||||
|
|
||||||
match = re.search(r'href\="([^"]+).*', postElementStr)
|
match = re.search(r'href\="([^"]+).*', postElementStr)
|
||||||
if match == None:
|
if match == None:
|
||||||
sendMessage(updater, channelID, 'Bot kaputt: Kann Hardwarename nicht capturen: ' + postElementStr)
|
sendMessage(bot, channelID, 'Bot kaputt: Kann Hardwarename nicht capturen: ' + postElementStr)
|
||||||
continue
|
continue
|
||||||
url = match.group(1)
|
url = match.group(1)
|
||||||
|
r = requests.head(url, allow_redirects=True)
|
||||||
message = 'Name: ' + hwName + '\nPreis: ' + preis + '\nShop: ' + shop + '\nURL: ' + url
|
clean_url = urljoin(r.url, urlparse(r.url).path)
|
||||||
sendMessage(updater, channelID, message)
|
if "notebooksbilliger" in r.url:
|
||||||
|
clean_url = r.url.split("?")[0]
|
||||||
|
if debug:
|
||||||
|
message = 'Name: ' + hwName + '\nPreis: ' + preis + '\nShop: ' + shop + '\nURL: ' + url + '\nresolved: ' + r.url + '\nnoQuery: ' + clean_url
|
||||||
|
else:
|
||||||
|
message = 'Name: ' + hwName + '\nPreis: ' + preis + '\nShop: ' + shop + '\nURL: ' + clean_url
|
||||||
|
sendMessage(bot, channelID, message)
|
||||||
else:
|
else:
|
||||||
page = requests.get(linkStr)
|
page = requests.get(linkStr)
|
||||||
|
|
||||||
@@ -171,21 +180,21 @@ def main():
|
|||||||
|
|
||||||
postElement = tree.xpath('//a[contains(text(), \'weiterlesen\')]')
|
postElement = tree.xpath('//a[contains(text(), \'weiterlesen\')]')
|
||||||
if len(postElement) != 1:
|
if len(postElement) != 1:
|
||||||
sendMessage(updater, channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln')
|
sendMessage(bot, channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln')
|
||||||
return end(-1, browser)
|
return end(-1, browser)
|
||||||
postElementStr = str(htmlstring(postElement[0], encoding='unicode'))
|
postElementStr = str(htmlstring(postElement[0], encoding='unicode'))
|
||||||
|
|
||||||
match = re.search(r'href\="([^"]+).*', postElementStr)
|
match = re.search(r'href\="([^"]+).*', postElementStr)
|
||||||
if match == None:
|
if match == None:
|
||||||
sendMessage(updater, channelID, 'Bot kaputt: Kann Hardwarename nicht capturen.')
|
sendMessage(bot, channelID, 'Bot kaputt: Kann Hardwarename nicht capturen.')
|
||||||
continue
|
continue
|
||||||
url = match.group(1)
|
url = match.group(1)
|
||||||
sendMessage(updater, channelID, 'News: ' + url)
|
sendMessage(bot, channelID, 'News: ' + url)
|
||||||
return end(0, browser)
|
return end(0, browser)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
browser.close()
|
browser.quit()
|
||||||
sendMessage(updater, channelID, str(e))
|
sendMessage(bot, channelID, str(e))
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
Reference in New Issue
Block a user