# HardwareChecker/bot.py

# coding: utf-8

from telegram import Bot
from lxml import html
import requests
from lxml.etree import tostring as htmlstring
from urllib.parse import urljoin, urlparse

import time
import re
import asyncio

from selenium import webdriver
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException

# Absolute XPath of the <a> element of the {index}-th entry (1-based) in the
# content list of the page loaded by getNewContent(); `{index}` is filled in
# with str.format. The commented-out line is an alternative variant of the
# same path with a different <div> nesting near the root.
#linkTemplate = '/html[1]/body[1]/div[2]/div[4]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[2]/div[1]/div[1]/ol[1]/li[{index}]/div[1]/div[1]/h3[1]/a[1]'
linkTemplate = '/html[1]/body[1]/div[1]/div[4]/div[4]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[2]/div[1]/div[1]/ol[1]/li[{index}]/div[1]/div[1]/h3[1]/a[1]'
# When True, messages are printed to stdout instead of being sent to Telegram
# and extra diagnostics are printed.
debug = False
# File holding the post count seen on the previous run (read and rewritten by
# checkForUpdate).
# NOTE(review): both branches of this conditional are the same path, so
# `debug` currently has no effect here - confirm whether a separate debug
# file was intended.
countStrPath = '/home/nenas/telegram/hardwareChecker/countStr.txt' if debug else '/home/nenas/telegram/hardwareChecker/countStr.txt'


def sendMessage(bot, channelID, message):
    """Send *message* to *channelID*, or print it when ``debug`` is set.

    Args:
        bot: Object whose ``sendMessage(channelID, message)`` returns an
            awaitable (the script passes a ``telegram.Bot``).
        channelID: Target chat/channel identifier, passed through unchanged.
        message: Text to deliver.

    The awaitable is driven to completion on the thread's event loop, which
    is reused across calls so the bot is always used from the same loop.
    """
    if debug:
        print(message)
        return

    # asyncio.get_event_loop() raises RuntimeError when no loop is set for the
    # thread (always in non-main threads, and in the main thread as of
    # Python 3.14), so fall back to creating and registering one ourselves.
    try:
        loop = asyncio.get_event_loop()
    except RuntimeError:
        loop = None
    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        asyncio.set_event_loop(loop)
    loop.run_until_complete(bot.sendMessage(channelID, message))

def getNewContent():
    """Open the news bot's profile page in a headless Firefox.

    The page is loaded up to three times; after each load the function waits
    up to 20 seconds for the first content link (``linkTemplate`` with
    index 1) to be present in the DOM.

    Returns:
        The running ``webdriver.Firefox`` instance as soon as the link is
        present (the caller is responsible for calling ``quit()`` on it), or
        ``-1`` if all three waits timed out, in which case the browser has
        already been closed.

    Raises:
        Any exception raised while loading the page or waiting, other than
        the wait's ``TimeoutException``. The browser is closed before the
        exception is re-raised so no Firefox process is left behind.
    """
    fireFoxOptions = webdriver.FirefoxOptions()
    fireFoxOptions.add_argument("--headless")
    browser = webdriver.Firefox(options=fireFoxOptions)

    try:
        for _ in range(3):
            browser.get('https://www.hardwareluxx.de/community/members/hwl-news-bot.268095/#recent-content')
            try:
                WebDriverWait(browser, 20).until(EC.presence_of_element_located((By.XPATH, linkTemplate.format(index='1'))))
                return browser
            except TimeoutException:
                # The content did not show up in time; reload and try again.
                if debug:
                    print("Timeout on WebDriverWait")
    except Exception:
        # Unexpected failure: do not leak the browser process.
        browser.quit()
        raise

    browser.quit()
    return -1

def getLinksForID(browser, id):
    """Return the href of the *id*-th entry in the content list.

    *id* is substituted into ``linkTemplate`` and the resulting XPath is
    looked up in *browser*. The href of the match is returned when exactly
    one element is found; otherwise ``-1`` is returned.
    """
    xpath = linkTemplate.format(index=str(id))

    hrefs = []
    for node in browser.find_elements("xpath", xpath):
        hrefs.append(node.get_attribute('href'))

    if len(hrefs) == 1:
        return hrefs[0]
    return -1

def checkForUpdate(browser):
    """Compare the post count shown on the page with the stored one.

    The count is read from the single link following the 'Beiträge' label
    (newlines and '.' separators are stripped before conversion to int) and
    compared with the first line of the file at ``countStrPath``. An empty
    file is treated as a stored count of -1. Whenever the two differ, the
    file is overwritten with the new count.

    Returns:
        ``-1`` if the count element is not found exactly once, otherwise the
        number of posts added since the stored count (``0`` when the count
        is unchanged or has decreased).
    """
    matches = browser.find_elements("xpath", '//dt[contains(text(),\'Beiträge\')]/../dd/a')
    if len(matches) != 1:
        return -1

    currentText = matches[0].text.replace('\n', '').replace('.', '')
    current = int(currentText)

    with open(countStrPath, 'r+') as stateFile:
        stored = stateFile.readline()
    previous = int(stored) if stored != '' else -1

    if current == previous:
        return 0

    # Persist the new count first, then report how many posts were added.
    with open(countStrPath, 'w') as stateFile:
        stateFile.write(currentText)
    return max(current - previous, 0)

def end(ret, browser):
    """Close *browser* and return *ret* unchanged.

    Lets a caller shut the browser down and return a status in a single
    ``return end(status, browser)`` statement.
    """
    try:
        return ret
    finally:
        # Runs before the function actually returns.
        browser.quit()

# Seconds to wait for each HTTP request, so that a stalled server cannot hang
# the run indefinitely.
_requestTimeout = 30


def _announceDeal(bot, channelID, masterRace, postID):
    """Fetch forum post *postID*, extract the offer and send it to Telegram.

    The post markup must contain ``Name: ...<br``, ``Shop: ...<br`` and
    ``Preis: ...<br`` entries and at least one ``href="..."``. If the post
    element or any of these fields cannot be found, a "Bot kaputt" message is
    sent to *channelID* and nothing else happens. Offers whose name contains
    "5090" are additionally sent to *masterRace*.
    """
    # Imported locally because the module-level name `html` is lxml's.
    from html import unescape

    linkToPost = 'https://www.hardwareluxx.de/community/posts/' + postID + '/'
    page = requests.get(linkToPost, timeout=_requestTimeout)
    tree = html.fromstring(page.content)

    postElement = tree.xpath('//article[@id=\'js-post-' + postID + '\']/div/div[2]/div/div/div/article/div[1]')
    if len(postElement) != 1:
        sendMessage(bot, channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln')
        return
    postElementStr = str(htmlstring(postElement[0], encoding='unicode'))

    # (label used in the error message, pattern whose group 1 is the value)
    captures = (
        ('Hardwarename', r'.*Name: (.*)<br.*'),
        ('Shop', r'.*Shop: (.*)<br.*'),
        ('Preis', r'.*Preis: (.*)<br.*'),
        ('URL', r'href\="([^"]+).*'),
    )
    values = []
    for label, pattern in captures:
        match = re.search(pattern, postElementStr)
        if match is None:
            sendMessage(bot, channelID, 'Bot kaputt: Kann ' + label + ' nicht capturen: ' + postElementStr)
            return
        values.append(match.group(1))
    hwName, shop, preis, url = values

    # The href was cut out of serialized markup, where '&' is written as
    # '&amp;'; decode it before using it as a real URL.
    url = unescape(url)

    # Follow redirects to the final shop URL, then drop query and fragment.
    r = requests.head(url, allow_redirects=True, timeout=_requestTimeout)
    clean_url = urljoin(r.url, urlparse(r.url).path)
    if "notebooksbilliger" in r.url or ".msi.com" in r.url:
        clean_url = r.url.split("?")[0]

    if debug:
        message = 'Name: ' + hwName + '\nPreis: ' + preis + '\nShop: ' + shop + '\nURL: ' + url + '\nresolved: ' + r.url + '\nnoQuery: ' + clean_url
    else:
        message = 'Name: ' + hwName + '\nPreis: ' + preis + '\nShop: ' + shop + '\nURL: ' + clean_url
    sendMessage(bot, channelID, message)
    if "5090" in hwName:
        sendMessage(bot, masterRace, message)


def _announceNews(bot, channelID, linkStr):
    """Fetch the page at *linkStr* and send its 'weiterlesen' link as news.

    If the page does not contain exactly one such link, or the link has no
    href, a "Bot kaputt" message is sent to *channelID* instead.
    """
    # Imported locally because the module-level name `html` is lxml's.
    from html import unescape

    page = requests.get(linkStr, timeout=_requestTimeout)
    tree = html.fromstring(page.content)

    postElement = tree.xpath('//a[contains(text(), \'weiterlesen\')]')
    if len(postElement) != 1:
        sendMessage(bot, channelID, 'Bot kaputt: Kann News-Link nicht ermitteln: ' + linkStr)
        return
    postElementStr = str(htmlstring(postElement[0], encoding='unicode'))

    match = re.search(r'href\="([^"]+).*', postElementStr)
    if match is None:
        sendMessage(bot, channelID, 'Bot kaputt: Kann URL nicht capturen.')
        return
    # Serialized markup escapes '&' as '&amp;'; send the real URL.
    sendMessage(bot, channelID, 'News: ' + unescape(match.group(1)))


def main():
    """Run one check: detect new posts of the news bot and announce them.

    Steps: load the profile page, determine how many posts were added since
    the last run, collect the links of those entries and announce each one,
    oldest first. Links ending in ``post-<digits>`` are treated as hardware
    offers, all others as news articles. A failure for a single link is
    reported to the channel and does not stop the remaining links.

    Returns:
        ``0`` when the run completed (including "nothing new"), ``-1`` when
        the page, the post count or the links could not be obtained or an
        unexpected exception occurred (its text is sent to the channel).
    """
    # NOTE(review): the bot token is a credential and is hard-coded here;
    # consider loading it from the environment or a config file instead.
    token = "1495297410:AAGCwqqUZVdcc6RjnyWtl7XtO0K2tz4EpDQ"
    channelID = '-1001418260700'
    masterRace = '-2466381126'
    bot = Bot(token)

    browser = getNewContent()
    if browser == -1:
        # The channel notification for this case is currently disabled:
#        sendMessage(bot, channelID, 'Bot kaputt: Kann dynamic Content nicht laden, Hardwareluxx-Server sind im Eimer')
        return -1

    try:
        ret = checkForUpdate(browser)
        if debug:
            print(f'Updates: {ret}')
        if ret == 0:
            return 0
        if ret < 0:
            sendMessage(bot, channelID, 'Bot kaputt: Kann static Content nicht laden')
            return -1

        # Entries 1..ret on the page are the new ones.
        links = []
        for index in range(ret):
            link = getLinksForID(browser, index + 1)
            if link == -1:
                sendMessage(bot, channelID, 'Bot kaputt: Kann Links nicht laden')
                return -1
            links.append(link)

        # Announce in reverse page order, i.e. entry `ret` first.
        for link in reversed(links):
            linkStr = str(link)
            # The pattern is anchored at the end, so there is at most one match.
            post = re.search(r'post-([0-9]*)$', linkStr)
            if post is not None:
                _announceDeal(bot, channelID, masterRace, post.group(1))
            else:
                _announceNews(bot, channelID, linkStr)
        return 0

    except Exception as e:
        sendMessage(bot, channelID, str(e))
        return -1
    finally:
        # Single cleanup point: the browser is closed on every path.
        browser.quit()

# Run a single check when executed as a script; main()'s return value is
# not used as the process exit code.
if __name__ == "__main__":
    main()