# coding: utf-8 from telegram import Bot from lxml import html import requests from lxml.etree import tostring as htmlstring from urllib.parse import urljoin, urlparse import time import re import asyncio from selenium import webdriver from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By from selenium.common.exceptions import TimeoutException #linkTemplate = '/html[1]/body[1]/div[2]/div[4]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[2]/div[1]/div[1]/ol[1]/li[{index}]/div[1]/div[1]/h3[1]/a[1]' linkTemplate = '/html[1]/body[1]/div[1]/div[4]/div[4]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[2]/div[1]/div[1]/ol[1]/li[{index}]/div[1]/div[1]/h3[1]/a[1]' debug = False countStrPath = '/home/nenas/telegram/hardwareChecker/countStr.txt' if debug else '/home/nenas/telegram/hardwareChecker/countStr.txt' def sendMessage(bot, channelID, message): if debug: print(message) else: loop = asyncio.get_event_loop() loop.run_until_complete(bot.sendMessage(channelID, message)) def getNewContent(): fireFoxOptions = webdriver.FirefoxOptions() fireFoxOptions.add_argument("--headless") browser = webdriver.Firefox(options=fireFoxOptions) for i in range(0, 3): browser.get('https://www.hardwareluxx.de/community/members/hwl-news-bot.268095/#recent-content') try: WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, linkTemplate.format(index='1')))) return browser except TimeoutException: if debug: print("Timeout on WebDriverWait") browser.quit() return -1 def getLinksForID(browser, id): elements = browser.find_elements("xpath", linkTemplate.format(index=(str(id)))) links = [elem.get_attribute('href') for elem in elements] if len(links) != 1: return -1 return links[0] def checkForUpdate(browser): xpathStr = '//dt[contains(text(),\'Beiträge\')]/../dd/a' count = browser.find_elements("xpath", xpathStr) if len(count) != 1: return -1 countStr = count[0].text.replace('\n', '') countStr = countStr.replace('.', '') count = int(countStr) line = '' with open (countStrPath, 'r+') as countStrFile: line = countStrFile.readline() if line == '': fileCount = -1 else: fileCount = int(line) if count != fileCount: with open (countStrPath, 'w') as countStrFile: countStrFile.write(countStr) if (count - fileCount) < 0: return 0 else: return count - fileCount return 0 def end(ret, browser): browser.quit() return ret def main(): token = "1495297410:AAGCwqqUZVdcc6RjnyWtl7XtO0K2tz4EpDQ" channelID = '-1001418260700' masterRace = '-2466381126' bot = Bot(token) browser = getNewContent() if browser == -1: # sendMessage(bot, channelID, 'Bot kaputt: Kann dynamic Content nicht laden, Hardwareluxx-Server sind im Eimer') return -1 try: ret = checkForUpdate(browser) if debug: print(f'Updates: {ret}') if ret == 0: return end(0, browser) if (ret < 0): sendMessage(bot, channelID, 'Bot kaputt: Kann static Content nicht laden') return end(-1, browser) links = [] for index in range(ret): link = getLinksForID(browser, index + 1) if link == -1: sendMessage(bot, channelID, 'Bot kaputt: Kann Links nicht laden') return end(-1, browser) links.insert(0, link) for link in links: linkStr = str(link) post = re.findall(r'post-[0-9]*$', linkStr) if len(post) > 1: sendMessage(bot, channelID, 'Bot kaputt: Kann Post-ID nicht ermitteln: ' + linkStr) continue if len(post) == 1: postID = post[0][5:] linkToPost = 'https://www.hardwareluxx.de/community/posts/' + postID + '/' page = requests.get(linkToPost) tree = html.fromstring(page.content) postElement = tree.xpath('//article[@id=\'js-post-'+ postID + '\']/div/div[2]/div/div/div/article/div[1]') if len(postElement) != 1: sendMessage(bot, channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln') continue postElementStr = str(htmlstring(postElement[0], encoding='unicode')) match = re.search(r'.*Name: (.*)