import requests
import time
import json
import pandas as pd
import datetime
import re
from pytz import timezone
import pytz
import os
from bs4 import BeautifulSoup
from v3_elasticId_gen.elasticId2 import os_setup
from opensearchpy import OpenSearch, RequestsHttpConnection,exceptions
from requests_aws4auth import AWS4Auth
import urllib.parse
import threading
import difflib
from urllib.parse import urlencode
from pytz import country_timezones
import concurrent.futures
import filtern
import numpy
import sys
from Crypto.Hash import keccak
import dirtyjson
import traceback
import pathlib
from pathlib import Path
from configparser import ConfigParser
import db_update
from v3_filter import filter

no_crawl=['007RRH-E','077MWL-E','0B8XNL-E','0GB43W-E']

config = ConfigParser()
config.read(Path.cwd() / "config.ini")

start_time_crawl_db = time.strftime("%Y-%m-%d %H:%M:%S", time.gmtime())

last_ts=1577836800
headers="""Host: static.reuters.com
User-Agent: Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:76.0) Gecko/20100101 Firefox/76.0
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate, br
DNT: 1
Connection: keep-alive
Referer: https://in.reuters.com/
TE: Trailers"""
header_dict=dict([[h.partition(':')[0], h.partition(': ')[2]] for h in headers.split('\n')])
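# e.g. header_dict == {'Host': 'static.reuters.com', 'User-Agent': 'Mozilla/5.0 ...', ..., 'TE': 'Trailers'}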

def convert_timedelta(duration):
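    """Split a timedelta into (hours, minutes, seconds); e.g. timedelta(days=1, seconds=4230) -> (25, 10, 30)."""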
    days, seconds = duration.days, duration.seconds
    hours = days * 24 + seconds // 3600
    minutes = (seconds % 3600) // 60
    seconds = (seconds % 60)
    return hours, minutes, seconds
log_file = time.ctime().replace(" ","_").replace(":","_")+'.txt'
if not (Path.cwd() / "logs").exists():
    (Path.cwd() / "logs").mkdir()

with open(Path.cwd() / "logs" / log_file,"w") as f:
    f.write("-----------------------------------------\n")
    f.write("Scraping on : "+log_file+" \n")
    f.write("-----------------------------------------\n")
    f.write("\n")

time_start_run=datetime.datetime.utcnow()
cwd = Path.cwd()

def get_links(start_thread,total_threads,ml):
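    """Page through the Reuters search JSON for keyw['alias'][ml] (starting at page start_thread+1,
    stepping by total_threads) and append each hit's id, timestamp, headline, href, blurb and
    thumbnail to the module-level per-company lists (ids, timestamp, headline, hreff, blurb, mpu,
    isdup, check). Stops on no_crawl entities, empty results, an already-seen article id, page 100,
    or once enough articles older than the stored last_ct have been collected."""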
    try:
        page_no=start_thread+1
        overall_retry_count=0
        retry_count=0
        extra_articles=0
        extra_limit=10
        break_flag=0
        continue_flag=0
        no_search_flag=0
        global_retries=0
        search_term=urllib.parse.quote_plus(keyw['alias'][ml].lower().strip())
        while True:
            if keyw['entity_id'][ml] in no_crawl:
                break
            while True:
                try:
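                    # the endpoint returns JSONP ("addMoreNewsResults( {...} );"); strip the wrapper and parse the payload with dirtyjson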
                    link_data=session.get("https://reuters.com/assets/searchArticleLoadMoreJson?blob="+search_term.replace(" ","+")+"&bigOrSmall=big&articleWithBlog=true&sortBy=date&dateRange=all&numResultsToShow=50&pn="+str(page_no)+"&callback=addMoreNewsResults").content.decode('UTF-8')
                    link_dict=dirtyjson.loads(link_data.split("ults( ")[1].split("]} );")[0]+"]}")
                    break
                except requests.exceptions.ConnectionError:
                    print("No Connection Retrying.")
                    time.sleep(0.2)
                    continue
                except:
                    time.sleep(0.2)
                    print(traceback.format_exc())
                    retry_count=retry_count+1
                    if(retry_count>5):
                        no_search_flag=1
                        break
            if(no_search_flag==1):
                print("No search")
                break
            if(link_dict['news']==[]):
                break
            if(link_dict['news'][0]["id"]==keyw['last_id'][ml] and page_no==1):
                break
            if(page_no>100):
                print("Max pages exceeded",page_no)
                break
            if(link_dict['totalResultNumber']==0):
                break
            continue_flag=0
            for k in range(0,len(link_dict['news'])):
                if extra_articles>=extra_limit:
                    break_flag=1
                    print(link_dict['totalResultNumber'],k,page_no)
                    
                    break
                if(int(datetime.datetime.strptime(link_dict['news'][k]["date"].replace("EDT","-0400").replace("EST","-0500"), "%B %d, %Y %I:%M%p %z").astimezone(pytz.utc).timestamp())<keyw['last_ct'][ml]):
                    extra_articles=extra_articles+1
                    
                
                title_filter=['table']
                u_flag=0
                t_flag=0
                titlen=link_dict['news'][k]["headline"].replace("<b>","").replace("</b>","")
                
                urln=link_dict['news'][k]["href"]

                for i in range(0,len(title_filter)):
                    if(re.search('^'+title_filter[i].lower()+r'+\s?\d?\s?(-|:)\s?', titlen.lower())):
                        t_flag=1
                        break
                
                if(t_flag==1):
                    continue
                #print(titlen)
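                # strip leading all-caps agency prefixes (e.g. "UPDATE 1-", "BRIEF:"); applied twice for stacked prefixes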
                titlen = re.sub(r'^[A-Z]+\s?[A-Z]+?\s?\d?\s?(-|:)\s?', '', titlen)
                titlen = re.sub(r'^[A-Z]+\s?[A-Z]+?\s?\d?\s?(-|:)\s??', '', titlen)
                #print(titlen)
                ids[ml].append(link_dict['news'][k]["id"])
                timestamp[ml].append(int(datetime.datetime.strptime(link_dict['news'][k]["date"].replace("EDT","-0400").replace("EST","-0500"), "%B %d, %Y %I:%M%p %z").astimezone(pytz.utc).timestamp()))
                headline[ml].append(titlen)
                hreff[ml].append(link_dict['news'][k]["href"])
                blurb[ml].append(link_dict['news'][k]["blurb"].replace("\n"," ").replace("<b>","").replace("</b>",""))
                mpu[ml].append(link_dict['news'][k]["mainPicUrl"])
                isdup[ml].append(False)
                check[ml].append(False)

                if(k==len(link_dict['news'])-1 and break_flag==0):
                   continue_flag=1


            if(timestamp[ml]!=[]):
                if(timestamp[ml][-1]<1483228800):
                    break
            if(continue_flag==1):
                page_no=page_no+1
                continue       

            if(break_flag==1):
                break
            
            page_no=page_no+total_threads
    except:
        print(traceback.format_exc())
        with open(cwd/"error_logs.txt",'a') as file:
            file.write(traceback.format_exc()+'\n')

def scrape(start_thread,total_threads):
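    """Walk this thread's share of the collected links for the current company (global ml), iterating
    the lists in reverse order. Duplicate-flagged articles are deleted from the n_reuters index; the
    rest are fetched, parsed with BeautifulSoup, passed through v3_filter.filter, and then either
    merged into an existing OpenSearch document or indexed as a new one."""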
    try:
        for ss in range(len(hreff[ml])-start_thread-1,-1,(-1*total_threads)):
            if(isdup[ml][ss]==True):
                keccak_hash = keccak.new(digest_bits=256)
                keccak_hash.update(str(ids[ml][ss]).encode('utf-8'))
                idk = keccak_hash.hexdigest()
                while True:
                    try:
                        es.delete(index="n_reuters",doc_type="_doc",id=idk)
                        print("Deleted")
                        del_cont_flag=1
                        break
                    except exceptions.NotFoundError:
                        print("Not Found for delete")
                        del_cont_flag=1
                        break
                    except:
                        time.sleep(0.5)
                        with open(cwd/"error_logs.txt",'a') as file:
                            file.write(traceback.format_exc()+'\n')
                if del_cont_flag==1:
                    print(del_cont_flag)
                    continue 
            if(timestamp[ml][ss]<=(keyw['last_ct'][ml])):
                continue
            c_hash_id=keyw['cid'][ml]
            print(keyw['entity_name'][ml],":",ss+1,"/",len(hreff[ml]))
            urll="https://reuters.com"+hreff[ml][ss]
            retry_count=0
            continue_flag=0
            max_retry_count=5
            while True:
                try:
                    loadd=(session.get(urll))
                    break
                except requests.exceptions.ConnectionError:
                    print("No Connection Retrying.")
                    time.sleep(0.5)
                    continue
                except:
                    retry_count=retry_count+1
                    if(retry_count>max_retry_count):
                        continue_flag=1
                        break
                    time.sleep(0.2)
                    
                    pass
            if continue_flag==1:
                continue
            urln=loadd.url
            url_filter=['india-morningcall/','markets-swiss-stock','article/global-','article/ifr-us-','easteurope-markets/','france-benelux-markets']
            u_flag=0
            for i in range(0,len(url_filter)):
                if(url_filter[i] in urln):
                    u_flag=1
                    break
            if(u_flag==1):
                continue
            load=loadd.content.decode('UTF-8')
            
            soup = BeautifulSoup(load, 'html.parser')
            if(soup.select('h1[class^="Headline-headline"]')==[]):
                continue
            head=soup.select('h1[class^="Headline-headline"]')[0].getText()
            head = re.sub(r'^[A-Z]+\s?[A-Z]+?\s?\d?\s?(-|:)\s?', '', head)
            head = re.sub(r'^[A-Z]+\s?[A-Z]+?\s?\d?\s?(-|:)\s?', '', head)
            search=soup.select('h1[class^="Headline-headline"]')[0].getText().lower()
            auth=soup.select('div[class^="Attribution-attribution"] > p')[0].getText() if soup.select('div[class^="Attribution-attribution"] > p')!=[] else "NULL"

            badge=soup.select('div.StandardArticleBody_trustBadgeContainer')
            if(soup.select('p.Attribution_content')!=[]):
                soup.select('p.Attribution_content')[0].decompose()
            if(badge!=[]):
                badge[0].decompose()
            for obj in soup.select('.Image_caption'):
                obj.decompose()
            ttype="article"
            #print(soup.select('article[class^="ArticlePage-article-body"]'))
            if(soup.select('article[class^="ArticlePage-article-body"] p[class*="Paragraph-paragraph"]')==[]):
                if(soup.select('pre')==[]):
                    body=""
                else:                    
                    body=soup.select('pre')[0].getText()
                    ttype="table"
            else:        
                body='\n'.join([i.getText() for i in soup.select('article[class^="ArticlePage-article"] p[class^="Paragraph-paragraph"]')])

            
            #print("body",body)
            filter_pass=filter.filter(body,[keyw['entity_id'][ml]])

            if(timestamp[ml][ss]>latest_ct[0]):
                latest_ct[0]=timestamp[ml][ss]
                latest_id[0]=ids[ml][ss]
            
            if(filter_pass==False):
                continue
            
            ret=filter_pass
            
            keccak_hash = keccak.new(digest_bits=256)
            keccak_hash.update(str(ids[ml][ss]).encode('utf-8'))
            idk = keccak_hash.hexdigest()
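            # the OpenSearch document id is the keccak-256 hash of the Reuters article id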
            
            if(len(body)>260):
                loc=body.find(' ',250,260)
                if(loc!=-1):
                    article_desc=body[0:loc]
                else:
                    article_desc=body[0:250]
            else:
                article_desc=body
            cont_flag=0

            data_array=[keyw['country'][ml],keyw['sector'][ml],keyw['industry'][ml]]
            data_array=[da.replace(' ','_').lower() for da in data_array]
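            # if a document with this id already exists, append this company's ids/metadata to it;
            # exceptions.NotFoundError means it is new and falls through to the indexing block below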
            
            while True:
                try:
                    match_data=es.get(index="n_reuters",doc_type="_doc",id=idk)
                    
                    if(keyw['entity_name'][ml] not in (match_data["_source"]["company_name"])):
                        
                        
                        r = match_data["_source"]["r"]
                        n = match_data["_source"]["n"]
                        ns = match_data["_source"]["ns"]
                        nss = match_data["_source"]["nss"]
                        tags = match_data["_source"]["tags"]
                        

                        if ret[0]:
                            r.append(c_hash_id)
                            for da in data_array:
                                if da not in r:
                                    r.append(da)

                        if ret[1]:
                            n.append(c_hash_id)
                            for da in data_array:
                                if da not in n:
                                    n.append(da)

                        if ret[2]:
                            ns.append(c_hash_id)
                            for da in data_array:
                                if da not in ns:
                                    ns.append(da)
                                    
                        if ret[3]:
                            nss.append(c_hash_id)
                            for da in data_array:
                                if da not in nss:
                                    nss.append(da)

                        ta = ret[4]
                        tags = ret[5]
                        tags_common = ret[6]
                        es.update(index='n_reuters',doc_type='_doc',id=idk,
                                body= {
                                        "script" : {
                                        "source": "ctx._source.company_name.add(params.company_name);ctx._source.entity_id.add(params.entity_id);ctx._source.sector.add(params.sector);ctx._source.industry.add(params.industry);ctx._source.debt_listing.add(params.debt_listing);ctx._source.usd_sales.add(params.usd_sales);ctx._source.country.add(params.country);ctx._source.tags.add(params.tags);ctx._source.r = params.r;ctx._source.n = params.n;ctx._source.ns = params.ns;ctx._source.nss = params.nss;",
                                        "lang": "painless",
                                        "params" : {
                                            "company_name" : keyw['entity_name'][ml],
                                            "entity_id" : keyw['entity_id'][ml],
                                            "country" : keyw['country'][ml],
                                            "debt_listing" : keyw['debt_listing_exchange'][ml],
                                            "usd_sales" : keyw['usd_sales'][ml],
                                            "sector" : keyw['sector'][ml],
                                            "industry" : keyw['industry'][ml],
                                            "tags":filter_pass[5],
                                            "r": r,
                                            "n": n,
                                            "ns": ns,
                                            "nss": nss
                                        }
                                        }
                                        }
                                    )
                        print("Updated")
                        keyw.at[ml,'last_pt']=timestamp[ml][ss]
                        keyw.at[ml,'last_id']=ids[ml][ss]
                        keyw.to_csv(cwd/"final.csv",index=False,encoding='latin1')
                        updates[0]=updates[0]+1
                        cont_flag=1
                        
                        break
                    else:
                        print("Exists")
                        cont_flag=1
                        break
                except exceptions.RequestError:
                    print("Request Error")
                except exceptions.NotFoundError:
                    break
                except:
                    time.sleep(0.5)
                    pass
            if cont_flag==1:
                continue
                             
                
            data_dict = {
                        "headline":head,
                        "main_article":body,
                        "published_timestamp":timestamp[ml][ss],
                        "source_url":loadd.url,
                        "source" : "Reuters",
                        "ISIN": "NULL",
                        "sector":[keyw['sector'][ml]],
                        "category":"NULL",
                        "author" :auth,
                        "entity_id" : [keyw['entity_id'][ml]],
                        "debt_listing" : [keyw['debt_listing_exchange'][ml]],
                        "id_on_source" : ids[ml][ss],
                        "sentiment" : "NULL",
                        "industry" : [keyw['industry'][ml]],
                        "type" : ttype,
                        "thumb" : mpu[ml][ss],
                        "company_name":[keyw['entity_name'][ml]],
                        "country":[keyw['country'][ml]],
                        "usd_sales":[keyw['usd_sales'][ml]],
                        "tags":[filter_pass[5]],                                                  
                        "tags_common":filter_pass[6],
                        "article_desc"              : article_desc,
                        "r"                         : [c_hash_id]+data_array if ret[0] else [],
                        "n"                         : [c_hash_id]+data_array if ret[1] else [],
                        "ns"                        : [c_hash_id]+data_array if ret[2] else [],
                        "nss"                       : [c_hash_id]+data_array if ret[3] else [],
                        "ta"                        : ret[4],
                                        }
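            # index the new document under the keccak id, retrying until the write succeeds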
            while True:
                try:
                    es.index(index="n_reuters", doc_type="_doc",id=idk, body=data_dict)
                    keyw.at[ml,'last_pt']=timestamp[ml][ss]
                    keyw.at[ml,'last_id']=ids[ml][ss]
                    keyw.to_csv(cwd/"final.csv",index=False,encoding='latin1')
                    updates[0]=updates[0]+1
                    print("New")
                    break
                except:
                    print(sys.exc_info())
                    time.sleep(1)
                    pass
    except:
        print(traceback.format_exc())
        with open(cwd/"error_logs.txt",'a') as file:
            file.write(traceback.format_exc()+'\n')

es = os_setup()

keyw=pd.read_csv(cwd/'final.csv',encoding='latin1')

with open(cwd/'rdata.txt','r') as file:
    rdata=file.read()
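
# rdata.txt holds the row range to crawl as "<start>-<end>"; an end of "max" runs through the last row of final.csv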

range1=rdata.split('-')[0]
range2=rdata.split('-')[1]
if range2=='max':
    range2=len(keyw['entity_id'])
    range1=int(range1)
else:
    range2=int(range2)
    range1=int(range1)
session=requests.Session()
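# NOTE (assumption): header_dict built above is never attached to this session; if browser-like headers
# are needed, something like `session.headers.update(header_dict)` could be added here.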

timestamp=[[] for i in range(0,len(keyw['cid']))]
ids=[[] for i in range(0,len(keyw['cid']))]
tz=[[] for i in range(0,len(keyw['cid']))]
tzz=[[] for i in range(0,len(keyw['cid']))]
headline=[[] for i in range(0,len(keyw['cid']))]
hreff=[[] for i in range(0,len(keyw['cid']))]
blurb=[[] for i in range(0,len(keyw['cid']))]
mpu=[[] for i in range(0,len(keyw['cid']))]
isdup=[[] for i in range(0,len(keyw['cid']))]
check=[[] for i in range(0,len(keyw['cid']))]
def get_all_links(start_thread,total_threads):
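    """Collect search-result links for this worker's share of company rows (range1..range2, striped by total_threads)."""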
    
    for ml in range(range1+start_thread,range2,total_threads):
        print(keyw['entity_name'][ml],ml)
        get_links(0,1,ml)

print(range1,range2)


total_workers=5
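# fetch search-result pages for every company row in parallel, striping rows across total_workers threads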
with concurrent.futures.ThreadPoolExecutor(max_workers=total_workers) as executor:
    executor.map(get_all_links, range(0,total_workers),[total_workers]*total_workers)

for ml in range(range1,range2):
    start_company=int(datetime.datetime.utcnow().timestamp())
    latest_ct=[keyw['last_ct'][ml]]
    latest_id=[keyw['last_id'][ml]]
    updates=[0]
    count=0
    search_term=keyw['alias'][ml]
    """
    ids=[]
    datem=[]
    dated=[]
    datey=[]
    datet=[]
    datetz=[]
    headline=[]
    hreff=[]
    blurb=[]
    mpu=[]
    timestamp=[]
    tz=[]
    tzz=[]
    check=[]
    isdup=[]
    """
    j=1
    tss=2483228800
    tp=100
    totall=0
    allc=0
    titlen=""
    """
    total=5
    with concurrent.futures.ThreadPoolExecutor(max_workers=total) as executor:
        executor.map(scrape_links, range(total),[total]*total)
    timestamp_sort=numpy.array(timestamp)

    """
    print(keyw['entity_name'][ml],ml)
    
    """
    timestamp_sort=numpy.array(timestamp)
    
    inds = (-timestamp_sort).argsort()
    timestamp=list(numpy.array(timestamp)[inds])
    timestamp=[i.item() for i in timestamp]
    ids=list(numpy.array(ids)[inds])
    tz=list(numpy.array(tz)[inds])
    tzz=list(numpy.array(tzz)[inds])
    headline=list(numpy.array(headline)[inds])
    hreff=list(numpy.array(hreff)[inds])
    blurb=list(numpy.array(blurb)[inds])
    mpu=list(numpy.array(mpu)[inds])
    """

    
    
    if(len(timestamp[ml])==0):
        continue
    if(ids[ml][0]==keyw['last_id'][ml]):
        continue
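    # flag near-duplicates: headlines within 12 hours (43200 s) of each other with a difflib
    # similarity ratio >= 0.7 are marked so scrape() deletes their documents and skips them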
    for i in range(0,len(headline[ml])-10):
        for j in range(1,8):
            try:
                if((timestamp[ml][i]-timestamp[ml][j+i]<43200) and (difflib.SequenceMatcher(None,headline[ml][i],headline[ml][j+i]).ratio()>=0.7)):
                    isdup[ml][i+j]=True
            except:
                pass
    
    head=[]
    body=[]
    date=[]
    thumbnail=[]
    url=[]
    source=[]
    isin=[]
    sector=[]
    author=[]
    symbol=[]
    exchange=[]
    id=[]
    sentiment=[]

    views=[]
    comments=[]
    count=0
    total=5
    emp_array=[]
    threads=[]
               

    total=5
    
    with concurrent.futures.ThreadPoolExecutor(max_workers=total) as executor:
        executor.map(scrape, range(total),[total]*total)
    
    
    keyw.at[ml,"last_ct"] = latest_ct[0]
    keyw.at[ml,"last_id"] = latest_id[0]
    if(keyw['last_id'][ml]=='no_id_yet'):
        keyw.at[ml,"last_id"] = ids[ml][0] if ids[ml]!=[] else 'no_id_yet'
    
    
    if(updates[0]>0):
        print("Updates Found")
        with open(Path.cwd() / "logs" / log_file,"a") as f:
            f.write(keyw['entity_name'][ml]+" ")
            f.write(str(updates[0])+" Updates")
            f.write("\n")
            
    with open(cwd/'last.txt','w') as file:
        file.write(str(ml))
keyw.to_csv(cwd/'final.csv',encoding='latin1',index=False)    
time_end_run=datetime.datetime.utcnow()
print(time_end_run)
hours, minutes, seconds = convert_timedelta((time_end_run-time_start_run))    
with open(Path.cwd() / "logs" / log_file,"a") as f:
    f.write("\n")
    f.write("-----------------------------------------\n")
    f.write("Took : "+'{} hours {} minutes {} seconds'.format(hours,minutes,seconds)+"\n")
    f.write("-----------------------------------------\n")
    
db_update.updateCrawlerStatus(host=config['database']['host'],
        user=config['database']['user'],
        password=config['database']['password'],
        db=config['database']['db'],
        crawler_id=config['database']['id'],
        dev_owner=config['database']['dev_owner'],
        update_user=config['database']['update_user'],
        scheduled_time=start_time_crawl_db,
        scheduled_timezone="GMT",
        active_flag=1,
        crawler_name="Reuters")
