# coding: utf-8
import re
import time

import requests
from dotenv import dotenv_values
from lxml import html
from lxml.etree import tostring as htmlstring
from selenium import webdriver
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from telegram.ext import Updater

# Absolute XPath of the link in the {index}-th entry of the result list on the
# profile page; {index} is 1-based (XPath indexing).
linkTemplate = '/html[1]/body[1]/div[2]/div[4]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[2]/div[1]/div[1]/ol[1]/li[{index}]/div[1]/div[1]/h3[1]/a[1]'

# When True, messages are printed instead of sent and the development copy of
# the counter file is used.
debug = False
countStrPath = '/home/nenas/projects/python/hardwareChecker/countStr.txt' if debug else '/home/nenas/telegram/hardwareChecker/countStr.txt'


def sendMessage(updater, channelID, message):
    """Deliver *message* to the channel, or print it when ``debug`` is set.

    Args:
        updater: object whose ``bot.sendMessage`` is used for delivery.
        channelID: target chat/channel identifier passed through to the bot.
        message: text to deliver.
    """
    if debug:
        print(message)
    else:
        updater.bot.sendMessage(channelID, message)


def getNewContent():
    """Open the profile page in headless Firefox and wait for the first link.

    Returns:
        The live ``webdriver.Firefox`` instance once the first entry matched
        by ``linkTemplate`` is present, or ``-1`` if loading timed out. The
        caller owns the returned browser and must ``quit()`` it.
    """
    fireFoxOptions = webdriver.FirefoxOptions()
    # Use the command-line switch instead of the ``headless`` property: the
    # property setter is gone in recent Selenium releases, where assigning it
    # would silently start a visible browser.
    fireFoxOptions.add_argument('-headless')
    browser = webdriver.Firefox(options=fireFoxOptions)
    try:
        browser.get('https://www.hardwareluxx.de/community/members/hwl-news-bot.268095/#recent-content')
        WebDriverWait(browser, 120).until(
            EC.presence_of_element_located((By.XPATH, linkTemplate.format(index='1'))))
    except TimeoutException:
        # Do not leave an orphaned Firefox process behind on failure.
        browser.quit()
        return -1
    except Exception:
        browser.quit()
        raise
    return browser


def getLinksForID(browser, id):
    """Return the href of the *id*-th entry on the loaded page.

    Args:
        browser: WebDriver positioned on the profile page.
        id: 1-based index substituted into ``linkTemplate``.

    Returns:
        The ``href`` attribute of the matching anchor, or ``-1`` unless
        exactly one element matches.
    """
    # find_elements(By.XPATH, ...) is what find_elements_by_xpath delegated
    # to; unlike the latter it exists in both Selenium 3 and Selenium 4.
    elements = browser.find_elements(By.XPATH, linkTemplate.format(index=str(id)))
    if len(elements) != 1:
        return -1
    return elements[0].get_attribute('href')


def checkForUpdate(browser):
    """Compare the post count shown on the page with the stored count.

    Reads the counter behind the 'Beiträge' label, strips newlines and
    thousands dots, and compares it with the integer stored in
    ``countStrPath``. When they differ, the file is overwritten with the new
    count.

    Returns:
        ``-1`` if the counter element is not found exactly once; otherwise
        the number of new posts (``0`` when unchanged or when the count
        decreased).
    """
    xpathStr = '//dt[contains(text(),\'Beiträge\')]/../dd/a'
    count = browser.find_elements(By.XPATH, xpathStr)
    if len(count) != 1:
        return -1
    countStr = count[0].text.replace('\n', '')
    countStr = countStr.replace('.', '')
    count = int(countStr)
    with open(countStrPath, 'r+') as countStrFile:
        line = countStrFile.readline()
    # An empty file means no count has been stored yet.
    if line == '':
        fileCount = -1
    else:
        fileCount = int(line)
    if count != fileCount:
        with open(countStrPath, 'w') as countStrFile:
            countStrFile.write(countStr)
        # NOTE(review): the remainder of this function (the return-value
        # computation) sits on the next physical line of the file, which is
        # still collapsed onto a single line and must be re-indented to
        # continue inside this ``if`` branch.
if (count - fileCount) < 0: return 0 else: return count - fileCount return 0 def end(ret, browser): browser.quit() return ret def main(): config = dotenv_values(".env") token = config["TOKEN"] channelID = config["CHANNELID"] updater = Updater(token) browser = getNewContent() if browser == -1: sendMessage(updater, channelID, 'Bot kaputt: Kann dynamic Content nicht laden') return -1 try: ret = checkForUpdate(browser) if ret == 0: return end(0, browser) if (ret < 0): sendMessage(updater, channelID, 'Bot kaputt: Kann static Content nicht laden') return end(-1, browser) links = [] for index in range(ret): link = getLinksForID(browser, index + 1) if link == -1: sendMessage(updater, channelID, 'Bot kaputt: Kann Links nicht laden') return end(-1, browser) links.insert(0, link) for link in links: linkStr = str(link) post = re.findall(r'post-[0-9]*$', linkStr) if len(post) > 1: sendMessage(updater, channelID, 'Bot kaputt: Kann Post-ID nicht ermitteln: ' + linkStr) continue if len(post) == 1: postID = post[0][5:] linkToPost = 'https://www.hardwareluxx.de/community/posts/' + postID + '/' page = requests.get(linkToPost) tree = html.fromstring(page.content) postElement = tree.xpath('//article[@id=\'js-post-'+ postID + '\']/div/div[2]/div/div/div/article/div[1]') if len(postElement) != 1: sendMessage(updater, channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln') continue postElementStr = str(htmlstring(postElement[0], encoding='unicode')) match = re.search(r'.*Name: (.*)