Files
HardwareChecker/bot.py

204 lines
7.3 KiB
Python

# coding: utf-8
from telegram import Bot
from lxml import html
import requests
from lxml.etree import tostring as htmlstring
from urllib.parse import urljoin, urlparse
import time
import re
import asyncio
from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
# Absolute XPath of the title link of the {index}-th entry (1-based) in the
# "recent content" list of the profile page. It is tied to the exact page
# layout; the previous layout used .../body[1]/div[2]/div[4]/div[1]/... instead
# of .../body[1]/div[1]/div[4]/div[4]/div[1]/...
linkTemplate = (
    '/html[1]/body[1]/div[1]/div[4]/div[4]/div[1]/div[1]/div[2]/div[1]'
    '/ul[1]/li[2]/div[1]/div[1]/ol[1]/li[{index}]'
    '/div[1]/div[1]/h3[1]/a[1]'
)

# When True, messages are printed to stdout instead of being sent to Telegram.
debug = False

# File holding the post count seen on the previous run. Debug and normal runs
# use the same file.
countStrPath = '/home/nenas/telegram/hardwareChecker/countStr.txt'
def sendMessage(bot, channelID, message):
    """Deliver *message* to the Telegram chat *channelID* via *bot*.

    In debug mode the message is only printed to stdout. Otherwise the
    coroutine returned by ``bot.sendMessage`` is run to completion on the
    current event loop, so this function blocks until the call finishes.

    Args:
        bot: object exposing an awaitable ``sendMessage(chat_id, text)``.
        channelID: target chat identifier.
        message: text to send.
    """
    if debug:
        print(message)
        return

    # asyncio.get_event_loop() no longer creates a loop implicitly on recent
    # Python versions (it raises RuntimeError when none is set), so create and
    # install one on demand. The same loop is reused across calls.
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = None
    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    loop.run_until_complete(bot.sendMessage(channelID, message))
def getNewContent():
    """Open the news bot's profile page in headless Firefox.

    The page is loaded up to three times; each attempt waits up to 20 seconds
    for the first entry of the recent-content list (``linkTemplate`` with
    index 1) to appear.

    Returns:
        The live WebDriver instance once the entry is present (the caller is
        responsible for calling ``quit()`` on it), or -1 if all three attempts
        timed out (the browser has already been shut down in that case).

    Raises:
        Any exception raised by Selenium other than the wait timeout; the
        browser is shut down before the exception propagates.
    """
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.add_argument("--headless")
    browser = webdriver.Firefox(options=fireFoxOptions)
    try:
        for _ in range(3):
            browser.get('https://www.hardwareluxx.de/community/members/hwl-news-bot.268095/#recent-content')
            try:
                WebDriverWait(browser, 20).until(
                    EC.presence_of_element_located((By.XPATH, linkTemplate.format(index='1'))))
                return browser
            except TimeoutException:
                if debug:
                    print("Timeout on WebDriverWait")
    except BaseException:
        # Do not leave a headless Firefox process behind when loading fails
        # for a reason other than the wait timing out.
        browser.quit()
        raise
    browser.quit()
    return -1
def getLinksForID(browser, id):
    """Return the href of the *id*-th entry of the recent-content list.

    The entry is located with ``linkTemplate`` formatted with *id*. Returns
    the link target when exactly one element matches, otherwise -1.
    """
    xpath = linkTemplate.format(index=str(id))
    hrefs = []
    for element in browser.find_elements("xpath", xpath):
        hrefs.append(element.get_attribute('href'))
    if len(hrefs) == 1:
        return hrefs[0]
    return -1
def checkForUpdate(browser):
    """Compare the profile's post counter with the value stored on disk.

    The counter is read from the link next to the 'Beiträge' label on the
    page loaded in *browser*; dots (presumably thousands separators) and
    newlines are stripped before it is parsed as an integer. Whenever the
    counter differs from the stored one, the file at ``countStrPath`` is
    rewritten with the new value.

    Returns:
        -1 if the counter element is not found exactly once,
        0 if there is nothing new (counter unchanged, counter decreased, or
        no previous value was stored yet),
        otherwise the number of posts added since the previous run.

    Raises:
        ValueError: if the counter text or the stored value is not an integer.
    """
    counters = browser.find_elements("xpath", '//dt[contains(text(),\'Beiträge\')]/../dd/a')
    if len(counters) != 1:
        return -1
    countStr = counters[0].text.replace('\n', '').replace('.', '')
    count = int(countStr)

    # A missing or empty state file means there is no baseline yet.
    try:
        with open(countStrPath, 'r') as countStrFile:
            line = countStrFile.readline().strip()
    except FileNotFoundError:
        line = ''
    fileCount = int(line) if line else None

    if count == fileCount:
        return 0

    with open(countStrPath, 'w') as countStrFile:
        countStrFile.write(str(count))

    if fileCount is None:
        # First run: only record the baseline. Reporting "count + 1" new posts
        # would make the caller request list entries that do not exist.
        return 0
    # A shrinking counter (e.g. deleted posts) is not an update.
    return max(count - fileCount, 0)
def end(ret, browser):
    """Shut down *browser* and pass *ret* through unchanged.

    Lets callers write ``return end(code, browser)`` to clean up and return
    in a single statement.
    """
    try:
        return ret
    finally:
        # Runs before the function actually returns; an error raised by
        # quit() still propagates to the caller.
        browser.quit()
# Seconds to wait for the forum / shop servers before giving up on a request.
_requestTimeout = 30


def _announceDeal(bot, channelID, masterRace, postID):
    """Fetch forum post *postID*, extract the deal data and send it to Telegram.

    Name, shop, price and the first link are captured from the post's HTML.
    The link is resolved through its redirects and stripped of its query
    string before being sent. Any extraction failure is reported to
    *channelID* and the post is skipped.
    """
    linkToPost = 'https://www.hardwareluxx.de/community/posts/' + postID + '/'
    page = requests.get(linkToPost, timeout=_requestTimeout)
    tree = html.fromstring(page.content)
    postElement = tree.xpath('//article[@id=\'js-post-' + postID + '\']/div/div[2]/div/div/div/article/div[1]')
    if len(postElement) != 1:
        sendMessage(bot, channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln')
        return
    postElementStr = str(htmlstring(postElement[0], encoding='unicode'))

    # Each field is expected as "<label>: <value><br" on one line of the post.
    captured = {}
    for label, description in (('Name', 'Hardwarename'), ('Shop', 'Shop'), ('Preis', 'Preis')):
        match = re.search(r'.*' + label + r': (.*)<br.*', postElementStr)
        if match is None:
            sendMessage(bot, channelID, 'Bot kaputt: Kann ' + description + ' nicht capturen: ' + postElementStr)
            return
        captured[label] = match.group(1)
    hwName = captured['Name']
    shop = captured['Shop']
    preis = captured['Preis']

    match = re.search(r'href\="([^"]+).*', postElementStr)
    if match is None:
        sendMessage(bot, channelID, 'Bot kaputt: Kann URL nicht capturen: ' + postElementStr)
        return
    url = match.group(1)

    # Follow redirects to the final shop URL, then drop the query string.
    r = requests.head(url, allow_redirects=True, timeout=_requestTimeout)
    clean_url = urljoin(r.url, urlparse(r.url).path)
    if "notebooksbilliger" in r.url or ".msi.com" in r.url:
        clean_url = r.url.split("?")[0]

    if debug:
        message = 'Name: ' + hwName + '\nPreis: ' + preis + '\nShop: ' + shop + '\nURL: ' + url + '\nresolved: ' + r.url + '\nnoQuery: ' + clean_url
    else:
        message = 'Name: ' + hwName + '\nPreis: ' + preis + '\nShop: ' + shop + '\nURL: ' + clean_url
    sendMessage(bot, channelID, message)
    if "5090" in hwName:
        sendMessage(bot, masterRace, message)


def _announceNews(bot, channelID, linkStr):
    """Fetch the page at *linkStr* and send the target of its 'weiterlesen' link."""
    page = requests.get(linkStr, timeout=_requestTimeout)
    tree = html.fromstring(page.content)
    postElement = tree.xpath('//a[contains(text(), \'weiterlesen\')]')
    if len(postElement) != 1:
        # There is no post ID in this branch, so report the link itself.
        sendMessage(bot, channelID, 'Bot kaputt: Kann Post für Link=' + linkStr + ' nicht ermitteln')
        return
    postElementStr = str(htmlstring(postElement[0], encoding='unicode'))
    match = re.search(r'href\="([^"]+).*', postElementStr)
    if match is None:
        sendMessage(bot, channelID, 'Bot kaputt: Kann News-URL nicht capturen.')
        return
    sendMessage(bot, channelID, 'News: ' + match.group(1))


def main():
    """Check the news bot's profile for new posts and forward them to Telegram.

    Links ending in ``post-<digits>`` are treated as deal posts, all other
    links as news. A failure on a single post is reported to the channel and
    the remaining posts are still processed.

    Returns:
        -1 if the profile page, the post counter or a list entry could not be
        loaded; 0 if there was nothing new or all new posts were processed;
        None if an unexpected exception occurred (its text is sent to the
        channel).
    """
    # NOTE(review): the bot token is a secret committed to source control. It
    # should be rotated and loaded from the environment or a config file.
    token = "1495297410:AAGCwqqUZVdcc6RjnyWtl7XtO0K2tz4EpDQ"
    channelID = '-1001418260700'
    masterRace = '-2466381126'
    bot = Bot(token)

    browser = getNewContent()
    if browser == -1:
        # No Telegram notification is sent for this case (the call below is disabled).
        # sendMessage(bot, channelID, 'Bot kaputt: Kann dynamic Content nicht laden, Hardwareluxx-Server sind im Eimer')
        return -1

    try:
        ret = checkForUpdate(browser)
        if debug:
            print(f'Updates: {ret}')
        if ret == 0:
            return 0
        if ret < 0:
            sendMessage(bot, channelID, 'Bot kaputt: Kann static Content nicht laden')
            return -1

        # List entry 1 is the newest post; collect them all, then reverse so
        # that posts are announced oldest first.
        links = []
        for index in range(ret):
            link = getLinksForID(browser, index + 1)
            if link == -1:
                sendMessage(bot, channelID, 'Bot kaputt: Kann Links nicht laden')
                return -1
            links.append(link)
        links.reverse()

        for link in links:
            linkStr = str(link)
            post = re.findall(r'post-[0-9]*$', linkStr)
            if len(post) > 1:
                sendMessage(bot, channelID, 'Bot kaputt: Kann Post-ID nicht ermitteln: ' + linkStr)
                continue
            if len(post) == 1:
                # Strip the leading "post-" to get the numeric ID.
                _announceDeal(bot, channelID, masterRace, post[0][5:])
            else:
                _announceNews(bot, channelID, linkStr)
        return 0
    except Exception as e:
        sendMessage(bot, channelID, str(e))
    finally:
        # Single cleanup point for every exit path above.
        browser.quit()
# Run one check when executed as a script; importing the module has no side effects.
if __name__ == "__main__":
    main()