# coding: utf-8
import re
import time

import requests
from dotenv import dotenv_values
from lxml import html
from lxml.etree import tostring as htmlstring
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from telegram.ext import Updater

# Member profile page that is polled, both statically (requests) and
# dynamically (Selenium).
profileURL = 'https://www.hardwareluxx.de/community/members/hwl-news-bot.268095/#recent-content'

# Seconds to wait for the static HTTP request before giving up.
requestTimeout = 30

# Absolute XPath of the anchor of the {index}-th (1-based) list entry on the
# profile page. It is tied to the exact page layout.
linkTemplate = '/html[1]/body[1]/div[2]/div[4]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[2]/div[1]/div[1]/ol[1]/li[{index}]/div[1]/div[1]/h3[1]/a[1]'


def getNewContent():
    """Open the profile page in headless Firefox and wait for the first entry.

    Returns:
        The live ``webdriver.Firefox`` instance once the element addressed by
        ``linkTemplate`` with index 1 is present, or ``-1`` if it did not show
        up within 120 seconds. On ``-1`` the browser has already been closed;
        on success the caller owns the browser.
    """
    options = webdriver.FirefoxOptions()
    # Equivalent to the former ``options.headless = True`` but also honoured by
    # Selenium releases that dropped the ``headless`` property.
    options.add_argument('-headless')
    browser = webdriver.Firefox(options=options)
    try:
        browser.get(profileURL)
        WebDriverWait(browser, 120).until(
            EC.presence_of_element_located(
                (By.XPATH, linkTemplate.format(index='1'))
            )
        )
    except TimeoutException:
        # Do not leave an orphaned Firefox process behind.
        browser.quit()
        return -1
    except Exception:
        browser.quit()
        raise
    return browser


def getLinksForID(browser, id):
    """Return the link target of the ``id``-th entry on the loaded page.

    Args:
        browser: WebDriver that already has the profile page loaded.
        id: 1-based position of the entry, substituted into ``linkTemplate``.

    Returns:
        The ``href`` attribute of the matching anchor, or ``-1`` when the XPath
        does not match exactly one element.
    """
    # find_elements(By.XPATH, ...) replaces find_elements_by_xpath, which no
    # longer exists in current Selenium 4 releases.
    elements = browser.find_elements(By.XPATH, linkTemplate.format(index=str(id)))
    links = [elem.get_attribute('href') for elem in elements]
    if len(links) != 1:
        # TODO: report the error
        return -1
    return links[0]


# File (relative to the working directory) holding the last seen post count.
countStrPath = 'countStr.txt'  # TODO


def _readStoredCount():
    """Return the post count saved in ``countStrPath``, or -1 if unavailable."""
    try:
        with open(countStrPath, 'r', encoding='utf-8') as countStrFile:
            line = countStrFile.readline().strip()
    except FileNotFoundError:
        # First run: there is no baseline yet.
        return -1
    if not line:
        return -1
    try:
        return int(line)
    except ValueError:
        # Corrupt content is treated like a missing baseline.
        return -1


def checkForUpdate():
    """Compare the post count on the profile page with the stored one.

    The count is read from the ``dd/a`` text next to the ``dt`` element that
    contains 'Beiträge'. When it differs from the stored value, the file at
    ``countStrPath`` is overwritten with the new count.

    Returns:
        ``-1`` if the page could not be fetched or the count could not be
        located or parsed; ``0`` if the count is unchanged or has decreased;
        otherwise the positive difference between the new and stored count.

    NOTE(review): without a stored baseline the stored count is taken as -1,
    so the result is ``count + 1`` (same as the previous empty-file behaviour).
    Confirm that this is intended.
    """
    try:
        page = requests.get(profileURL, timeout=requestTimeout)
    except requests.RequestException:
        return -1

    tree = html.fromstring(page.content)
    count = tree.xpath("//dt[contains(text(),'Beiträge')]/../dd/a/text()")
    if len(count) != 1:
        return -1

    # The site prints the number with '.' as thousands separator.
    countStr = count[0].replace('\n', '').replace('.', '')
    try:
        count = int(countStr)
    except ValueError:
        return -1

    fileCount = _readStoredCount()
    if count == fileCount:
        return 0

    with open(countStrPath, 'w', encoding='utf-8') as countStrFile:
        countStrFile.write(str(count))
    return max(count - fileCount, 0)


# NOTE(review): main() begins at the end of this physical line and continues on
# the next one, which is still collapsed and was not reformatted here. Its
# opening is preserved verbatim so nothing is lost when that line is restored:
# def main(): ret = checkForUpdate() if ret == 0:
return 0 config = dotenv_values(".env") token = config["TOKEN"] channelID = config["CHANNELID"] updater = Updater(token) if (ret < 0): #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann static Content nicht laden') print('Bot kaputt: Kann static Content nicht laden') return -1 browser = getNewContent() if browser == -1: #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann dynamic Content nicht laden') print ('Bot kaputt: Kann dynamic Content nicht laden') links = [] for index in range(ret): link = getLinksForID(browser, index + 1) if link == -1: #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann Links nicht laden') print ('Bot kaputt: Kann Links nicht laden') return -1 links.insert(0, link) for link in links: linkStr = str(link) post = re.findall(r'post-[0-9]*$', linkStr) if len(post) > 1: #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann Post-ID nicht ermitteln') print ('Bot kaputt: Kann Post-ID nicht ermitteln') return -1 if len(post) == 1: postID = post[0][5:] #print(post[0][5:]) linkToPost = 'https://www.hardwareluxx.de/community/posts/' + postID + '/' page = requests.get(linkToPost) tree = html.fromstring(page.content) postElement = tree.xpath('//article[@id=\'js-post-'+ postID + '\']/div/div[2]/div/div/div/article/div[1]') if len(postElement) != 1: #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln') print ('Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln') continue postElementStr = str(htmlstring(postElement[0], encoding='unicode')) #print(postElementStr) #updater.bot.sendMessage(channelID, postElementStr) match = re.search(r'.*Name: (.*)