# coding: utf-8
#
# Helper functions of the hardwareluxx news bot: they start a headless
# browser on the news-bot profile page, detect how many posts were published
# since the last run and extract the links to those posts.

# Absolute XPath of the n-th entry in the "recent content" list.
linkTemplate = '/html[1]/body[1]/div[2]/div[4]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[2]/div[1]/div[1]/ol[1]/li[{index}]/div[1]/div[1]/h3[1]/a[1]'

# In debug mode messages are printed instead of being sent to Telegram and a
# development copy of the counter file is used.
debug = False
countStrPath = '/home/nenas/projects/python/hardwareChecker/countStr.txt' if debug else '/home/nenas/telegram/hardwareChecker/countStr.txt'

# Same value as selenium's ``By.XPATH``; spelled out so that the lookup
# helpers below do not need selenium to be importable.
_BY_XPATH = 'xpath'


def sendMessage(updater, channelID, message):
    """Deliver *message* to the Telegram channel, or print it in debug mode.

    Args:
        updater: object exposing ``bot.sendMessage(chat_id, text)``.
        channelID: target chat / channel identifier.
        message: text to deliver.
    """
    if debug:
        print(message)
    else:
        updater.bot.sendMessage(channelID, message)


def getNewContent():
    """Open the news-bot profile page in a headless Firefox.

    Waits up to 120 seconds for the first "recent content" link to appear.

    Returns:
        The ready-to-use webdriver instance, or ``-1`` when the page did not
        load in time. On every failure path the browser is shut down, so no
        Firefox/geckodriver process is left behind.
    """
    # Imported lazily: selenium is only needed when a real browser is started,
    # which keeps the other helpers usable without it.
    from selenium import webdriver
    from selenium.common.exceptions import TimeoutException
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    fireFoxOptions = webdriver.FirefoxOptions()
    # The command-line switch works on old and new selenium releases alike;
    # the ``headless`` property setter is gone from newer ones.
    fireFoxOptions.add_argument('-headless')
    browser = webdriver.Firefox(options=fireFoxOptions)

    loaded = False
    try:
        browser.get('https://www.hardwareluxx.de/community/members/hwl-news-bot.268095/#recent-content')
        WebDriverWait(browser, 120).until(
            EC.presence_of_element_located((By.XPATH, linkTemplate.format(index='1')))
        )
        loaded = True
        return browser
    except TimeoutException:
        return -1
    finally:
        # Covers the timeout as well as any unexpected error: the caller only
        # gets (and later quits) the browser when loading succeeded.
        if not loaded:
            browser.quit()


def getLinksForID(browser, id):
    """Return the href of the *id*-th (1-based) entry of the recent-content list.

    Args:
        browser: webdriver (or compatible object) offering
            ``find_elements(by, value)``.
        id: 1-based position of the entry in the list.

    Returns:
        The link target, or ``-1`` unless exactly one element matched.
    """
    elements = browser.find_elements(_BY_XPATH, linkTemplate.format(index=str(id)))
    if len(elements) != 1:
        return -1
    return elements[0].get_attribute('href')


def _readStoredCount(path):
    """Return the post count saved in *path*, or ``None`` if there is none.

    A missing, empty or unparsable file all mean "no usable baseline".
    """
    try:
        with open(path, 'r') as countStrFile:
            line = countStrFile.readline().strip()
    except FileNotFoundError:
        return None
    try:
        return int(line)
    except ValueError:
        return None


def checkForUpdate(browser, countPath=None):
    """Compare the profile's post counter with the value stored on disk.

    The counter shown next to the 'Beiträge' label is read from the page
    (thousands separators removed) and, if it differs from the stored value,
    written to the counter file.

    Args:
        browser: webdriver (or compatible object) offering
            ``find_elements(by, value)``.
        countPath: file holding the last seen counter; defaults to the
            module-level ``countStrPath``.

    Returns:
        The number of posts published since the last run, ``0`` when nothing
        is new (including the very first run, which only records a baseline,
        and a counter that went down), or ``-1`` when the counter cannot be
        read from the page.
    """
    path = countStrPath if countPath is None else countPath

    xpathStr = '//dt[contains(text(),\'Beiträge\')]/../dd/a'
    matches = browser.find_elements(_BY_XPATH, xpathStr)
    if len(matches) != 1:
        return -1

    # The page renders e.g. "12.345"; drop line breaks and the separator dots.
    countStr = matches[0].text.replace('\n', '').replace('.', '')
    try:
        count = int(countStr)
    except ValueError:
        return -1

    fileCount = _readStoredCount(path)
    if count == fileCount:
        return 0

    with open(path, 'w') as countStrFile:
        countStrFile.write(countStr)

    if fileCount is None:
        # No baseline yet: we cannot know which posts are new, so only
        # remember the current counter instead of reporting every post.
        return 0
    return max(count - fileCount, 0)


def end(ret, browser):
    """Shut down *browser* and hand back *ret* (for ``return end(...)``)."""
    browser.quit()
    return ret
postElementStr = str(htmlstring(postElement[0], encoding='unicode')) + + match = re.search(r'.*Name: (.*) 1: + #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann Post-ID nicht ermitteln') + print ('Bot kaputt: Kann Post-ID nicht ermitteln') + return -1 + if len(post) == 1: + postID = post[0][5:] + + #print(post[0][5:]) + linkToPost = 'https://www.hardwareluxx.de/community/posts/' + postID + '/' + + page = requests.get(linkToPost) + + tree = html.fromstring(page.content) + + postElement = tree.xpath('//article[@id=\'js-post-'+ postID + '\']/div/div[2]/div/div/div/article/div[1]') + if len(postElement) != 1: + #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln') + print ('Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln') + continue + postElementStr = str(htmlstring(postElement[0], encoding='unicode')) + #print(postElementStr) + #updater.bot.sendMessage(channelID, postElementStr) + + match = re.search(r'.*Name: (.*)