# -*- coding: UTF-8 -*-
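# Scrape announcements from the Taichung Economic Development Bureau website
# and insert new ones into the articles table.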
import requests
from bs4 import BeautifulSoup
from lxml import etree
import pymysql
import datetime

# initialise so the finally block can still close the handle if connect() fails
connection = None

try:
    connection = pymysql.connect(
        host='127.0.0.1',
        db='richbs5_farmdb',
        user='richbs5_farmadmin',
        passwd='Richbs50080',
        charset='utf8mb4',
    )
    cursor = connection.cursor()

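    # present a desktop-browser User-Agent; the site may reject the default one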
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.0 (KHTML, like Gecko) Chrome/4.0.206.0 Safari/532.0'}

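    # fetch the announcement list page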
    response = requests.get('https://www.economic.taichung.gov.tw/1492480/Nodelist', headers=headers, timeout=30)
    response.raise_for_status()  # surface HTTP errors instead of silently parsing an error page

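    # parse with BeautifulSoup, then feed lxml so entries can be located via XPath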
    soup = BeautifulSoup(response.text, "html.parser")
    dom = etree.HTML(str(soup))
    articles = dom.xpath('//*[@id="center"]/section[@class="np"]//li/a')

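    # URLs already stored for this article type, used below to skip duplicates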
    select_qry = "SELECT `article_url` FROM `articles` WHERE `article_type` = 2 AND `deleted_at` IS NULL"
    cursor.execute(select_qry)
    article_urls = {item[0] for item in cursor.fetchall()}  # set for O(1) membership checks

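    # walk every list entry and scrape only articles not seen before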
    for article in articles:
        url = article.xpath('@href')[0]
        # prefix site-relative links; startswith avoids false matches deeper in the URL
        if not url.startswith('http'):
            url = 'https://www.economic.taichung.gov.tw' + url

        if url in article_urls:
            continue

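        # download the article page itself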
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.text, "html.parser")

        article_content = ''
        article_attachment = ''
        # the link text on the list page doubles as the article title
        article_title = article.xpath('text()')[0].strip()
        flag = False  # flipped on once the page yields something worth storing

        # off-site links and PDFs have no HTML body to scrape; store the URL as-is
        if not url.startswith('https://www.economic.taichung.gov.tw') or url.endswith('.pdf'):
            flag = True
        else:
            dom = etree.HTML(str(soup))
            article_soup = None  # stays None if the page matches neither known layout

            # article pages come in two layouts: an <article id="cpArticle"> body
            # or a <section class="list"> listing
            if dom.xpath('//*[@id="cpArticle"]'):
                article_soup = soup.find('article', id='cpArticle')
                flag = True

            if dom.xpath('//*[@id="center"]/section[@class="list"]'):
                article_soup = soup.find('section', class_='list')
                flag = True

            if article_soup:
                # flatten the markup and undo prettify's entity escaping;
                # '&amp;' must be replaced last, or doubly-escaped entities
                # (e.g. '&amp;lt;') would be unescaped twice
                article_content = article_soup\
                    .prettify()\
                    .replace('\n', '')\
                    .replace('<article class="cpArticle" id="cpArticle">', '<div>')\
                    .replace('</article>', '</div>')\
                    .replace('&quot;', '"')\
                    .replace('&lt;', '<')\
                    .replace('&gt;', '>')\
                    .replace('&amp;', '&')

                # rewrite relative image and link targets to absolute URLs
                for img in article_soup.find_all('img'):
                    src = img.get('src')
                    if src and not src.startswith('http'):
                        article_content = article_content.replace(src, 'https://www.economic.taichung.gov.tw' + src)

                for a in article_soup.find_all('a'):
                    href = a.get('href')
                    if href and not href.startswith('http'):
                        article_content = article_content.replace(href, 'https://www.economic.taichung.gov.tw' + href)

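            # attachment links live in a separate <section class="attachment"> block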
            attachment_soup = soup.find('section', class_='attachment')
            if attachment_soup:
                article_attachment = attachment_soup.prettify().replace('\n', '')

                for a in attachment_soup.find_all('a'):
                    href = a.get('href')
                    if href and not href.startswith('http'):
                        article_attachment = article_attachment.replace(href, 'https://www.economic.taichung.gov.tw' + href)

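        # store anything flagged: parsed pages and external/PDF links alike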
        if flag:
            insert_qry = "INSERT INTO `articles` (`article_type`, `article_url`, `article_category_id`, `article_title`, `article_content`) VALUES (%s, %s, %s, %s, %s)"
            cursor.execute(insert_qry, (2, url, 1, article_title, article_content + article_attachment))

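            # commit per article so earlier inserts survive a mid-run failure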
            connection.commit()
except Exception as e:
    with open('log.txt', 'a', encoding='utf-8') as f:
        f.write("%s %s\n" % (datetime.datetime.now(), e))
finally:
    with open('log.txt', 'a', encoding='utf-8') as f:
        f.write("%s crawler run finished\n" % datetime.datetime.now())

    # connect() may have failed before the handle was assigned
    if connection:
        connection.close()