Init

2021-05-04 19:02:38 +02:00
parent 0fd9d69d6c
commit 8b5b2bd887
3 changed files with 394 additions and 0 deletions
--- a/test.py
+++ b/test.py
@@ -0,0 +1,200 @@
+# coding: utf-8
+
+from telegram.ext import Updater
+from lxml import html
+import requests
+from lxml.etree import tostring as htmlstring
+
+import time
+import re
+
+from dotenv import dotenv_values
+
+from selenium import webdriver
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import TimeoutException
+
+# Absolute XPath to the <a> inside the h3 of one <li> of an ordered list on the
+# member page. `{index}` is the 1-based position of that <li> and is filled in
+# with str.format() by getNewContent() and getLinksForID().
+# NOTE(review): the path is fully positional, so presumably any change to the
+# site's layout breaks it - confirm against the live page.
+linkTemplate = '/html[1]/body[1]/div[2]/div[4]/div[1]/div[1]/div[2]/div[1]/ul[1]/li[2]/div[1]/div[1]/ol[1]/li[{index}]/div[1]/div[1]/h3[1]/a[1]'
+
+def getNewContent():
+    fireFoxOptions = webdriver.FirefoxOptions()
+    fireFoxOptions.headless = True
+    browser = webdriver.Firefox(options=fireFoxOptions)
+
+    browser.get('https://www.hardwareluxx.de/community/members/hwl-news-bot.268095/#recent-content')
+
+    try:
+        WebDriverWait(browser, 120).until(EC.presence_of_element_located((By.XPATH, linkTemplate.format(index='1'))))
+        return browser
+    except TimeoutException:
+        return -1
+
+def getLinksForID(browser, id):
+    elements = browser.find_elements_by_xpath(linkTemplate.format(index=(str(id))))
+    links = [elem.get_attribute('href') for elem in elements]
+
+    if len(links) != 1:
+        #TODO Fehler
+        return -1
+    return links[0]
+    
+# File in which checkForUpdate() keeps the post count seen on the previous run.
+# The path is relative, so it is resolved against the current working directory.
+countStrPath = 'countStr.txt' #TODO
+
+def checkForUpdate():
+    page = requests.get('https://www.hardwareluxx.de/community/members/hwl-news-bot.268095/#recent-content')
+
+    tree = html.fromstring(page.content)
+
+    count = tree.xpath('//dt[contains(text(),\'Beiträge\')]/../dd/a/text()')
+
+    if len(count) != 1:
+        return -1
+
+
+    countStr = count[0].replace('\n', '')
+    countStr = countStr.replace('.', '')
+
+    count = int(countStr)
+
+    line = ''
+
+    with open (countStrPath, 'r+') as countStrFile:
+        line = countStrFile.readline()
+
+    if line == '':
+        fileCount = -1
+    else:
+        fileCount = int(line)
+
+    if count != fileCount:
+        with open (countStrPath, 'w') as countStrFile:
+            countStrFile.write(countStr)
+        if (count - fileCount) < 0:
+            return 0
+        else:
+            return count - fileCount
+    return 0
+    
+
+def main():
+    
+    ret = checkForUpdate()
+
+    if ret == 0:
+        return 0
+
+
+    config = dotenv_values(".env")
+
+
+    token = config["TOKEN"]
+    channelID = config["CHANNELID"]
+
+    updater = Updater(token)
+
+    if (ret < 0):
+        #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann static Content nicht laden')
+        print('Bot kaputt: Kann static Content nicht laden')
+        return -1
+    
+    browser = getNewContent()
+    if browser == -1:
+        #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann dynamic Content nicht laden')
+        print ('Bot kaputt: Kann dynamic Content nicht laden')
+    
+    links = []    
+    for index in range(ret):
+        link = getLinksForID(browser, index + 1)
+        if link == -1:
+            #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann Links nicht laden')
+            print ('Bot kaputt: Kann Links nicht laden')
+            return -1
+        links.insert(0, link)
+    
+    for link in links:
+        linkStr = str(link)
+        post = re.findall(r'post-[0-9]*$', linkStr)
+        if len(post) > 1:
+            #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann Post-ID nicht ermitteln')
+            print ('Bot kaputt: Kann Post-ID nicht ermitteln')
+            return -1
+        if len(post) == 1:
+            postID = post[0][5:]
+
+            #print(post[0][5:])
+            linkToPost = 'https://www.hardwareluxx.de/community/posts/' + postID + '/'
+
+            page = requests.get(linkToPost)
+
+            tree = html.fromstring(page.content)
+
+            postElement = tree.xpath('//article[@id=\'js-post-'+ postID + '\']/div/div[2]/div/div/div/article/div[1]')
+            if len(postElement) != 1:
+                #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln')
+                print ('Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln')
+                continue
+            postElementStr = str(htmlstring(postElement[0], encoding='unicode'))
+            #print(postElementStr)
+            #updater.bot.sendMessage(channelID, postElementStr)
+            
+            match =  re.search(r'.*Name: (.*)<br.*', postElementStr)
+            if match == None:
+                #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann Hardwarename nicht capturen.')
+                print ('Bot kaputt: Kann Hardwarename nicht capturen.')
+                continue
+            hwName = match.group(1)
+
+            match =  re.search(r'.*Shop: (.*)<br.*', postElementStr)
+            if match == None:
+                #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann Shop nicht capturen.')
+                print ('Bot kaputt: Kann Shop nicht capturen.')
+                continue
+            shop = match.group(1)
+
+            match =  re.search(r'.*Preis: (.*)<br.*', postElementStr)
+            if match == None:
+                #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann Shop nicht capturen.')
+                print ('Bot kaputt: Kann Shop nicht capturen.')
+                continue
+            preis = match.group(1)
+
+            match =  re.search(r'href\="([^"]+).*', postElementStr)
+            if match == None:
+                #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann Hardwarename nicht capturen.')
+                print ('Bot kaputt: Kann URL nicht capturen.')
+                continue
+            url = match.group(1)
+            
+            message = 'Name: ' + hwName + '\nPreis: ' + preis + '\nShop: ' + shop + '\nURL: ' + url
+            print(message)
+            #updater.bot.sendMessage(channelID, message)
+        else:
+            page = requests.get(linkStr)
+
+            tree = html.fromstring(page.content)
+
+            postElement = tree.xpath('//a[contains(text(), \'weiterlesen\')]')
+            if len(postElement) != 1:
+                #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann Post für ID=' + postID + ' nicht ermitteln')
+                print ('Bot kaputt: Kann Post für News=' + linkStr + ' nicht ermitteln')
+                return -1
+            postElementStr = str(htmlstring(postElement[0], encoding='unicode'))
+
+            match =  re.search(r'href\="([^"]+).*', postElementStr)
+            if match == None:
+                #updater.bot.sendMessage(channelID, 'Bot kaputt: Kann Hardwarename nicht capturen.')
+                print ('Bot kaputt: Kann URL zu News nicht capturen.')
+                continue
+            url = match.group(1)
+            print('News: ' + url)
+            #updater.bot.sendMessage(channelID, 'News: linkStr')
+
+        
+    
+    
+    #updater.bot.sendMessage(channelID, 'https://www.hardwareluxx.de/community/members/hwl-news-bot.268095/#latest-activity')
+
+# Run one update check when the file is executed as a script; importing the
+# module does not trigger it.  The value returned by main() is not used.
+if __name__ == "__main__":
+    main()