3
|gbI                 @   st  d dl mZmZmZ d dlmZ d dlmZ d dl	Z	d dl
Z
d dlZd dlZd dlZd dlZd dlZd dlZd dlmZ d dlmZ d dlZd dlmZ d dlmZ d d	lmZ d d
lmZ d dlm Z  ej Z!e Z"e"j#d e j$e"d d e"d d  ej% j&ddZ'ej( d j) dkr2ej( d j*  e+ej( d e' dZ,e,j-d e,j-d e,j-de' d  e,j-d e,j.  e"d d Z/e"d d Z0e"d d Z1e"d d Z2e"d d Z3ee2e3e0e1Z4ej5 Z6e"d d Z7ddd d!d"d#Z8ej( e"d$ d%  Z9d&d'd&d&d(d&d&d)d*d+d,d-d&d-d-dd.Z:ej;e9Z<da=d/d0 Z>e>d1d2 Z?e+d3d42Z,e@e,j# ZAe,jBd  e,j-eCeAd5 d6  W dQ R X eAd krd ZDd7ZEn:eAd5krd7ZDd8ZEn&eAd9krd8ZDd:ZEneAd;krd:ZDdFZEejFjGd5d<ZHd=d> e<jId7d8 D ZJW dQ R X t=d?kr,eKd@tL eHjMd?dA e+ej( d e' dB(Z,e,j-dCeCej e! dD  dE  W dQ R X dS )G    )
OpenSearchRequestsHttpConnection
exceptions)AWS4Auth)BeautifulSoupN)date)ConfigParser)randint)Path)filter)
elasticId2)	translatez
config.iniZelasticSearch	access_id
secret_key _logsFwz

z*-----------------------------------------
zScraping on : z -
hostregionserviceDEFAULTZ
query_link1zhMozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.61 Safari/537.36z|text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9zno-cachez$fe0ce6f0-2728-0a38-7539-9d574f047042)zupgrade-insecure-requestsz
user-agentacceptzcache-controlzpostman-tokenZcompanyListZsource_file newsallZ20z2020-01-01 00:00:00z2020-12-31 23:59:59Z20200)qcfromcolrangesourcecountrysizeZstimeZetimetimeZdpcaZpsZpfpagec                s    fdd}|S )Nc           	      s(   y
 | |S    t j adaY nX d S )NT)	traceback
format_excreExcetoExit)argskwargs)f )/home/ubuntu/scraper_sina/scraper_sina.pymodified<   s
    
zexec.<locals>.modifiedr0   )r/   r2   r0   )r/   r1   exec:   s    r3   c       %   =   C   sR  t sHddd tj| df jdD krLddd tj| df jdD krLd S tj| df tj| df tj| d	f tj| d
f tj| df tj| df tj| df tj| df tj| df tj| df g
}|d td< g }d}d}d}x|rd}xlytjtttdd}P W q t	k
rZ } z0t
d |d7 }|dkr>|tjd wW Y d d }~X qX qW t|jd}	|	jdddir|dkrtj|	jdddij}
tjd|
}ttt|}t|d }
|
dkrP x|	jdddijdddiD ]|}|jd dd!ijjd"}|d d" |d  }ttjj|d$j }|d%ksN||d& k rTd'}P |j|jd(jd) qW t||
ks|t|krP t|}|d7 }|td*< qt
d+tt d S qW t
d,| d\}}
x|d d d D 
]}d-|kr|d7 }qd}xlytj|dd.}P W nP t	k
rf } z2t
d |d7 }|dkrH|tjd wW Y d d }~X nX qW t|jd}	d\}}d0|krtjd|d }y|	jd1dd2ij}W n   |	jdd3d4ij}Y nX n.d5|ksd6|ksd7|ksd8|kry|	jd9d:d;ijd<}W n   |d7 }wY nX y:x*|	jddd=ijd>D ]}||j7 }qLW tj|}W n   |d7 }wY nX n|d?|kr|	jd9d:d@ijd<}x*|	jdddAijd>D ]}||j7 }qW tj|}n$dB|kr|d7 }qn|d7 }qtj|}|dC |d |d# g}dDd |D }yd}xhyt jdEdF|dGdH}P W nH t	k
r } z*|d7 }|dkr~|tjd w<W Y d d }~X nX q<W |dI |dJ dK kr|dJ dL }t!j!||dM g}|d'kr|dJ dN }|dJ dO }|dJ dP }|dJ dQ }|d rN|j|dR  x"|D ]}||kr0|j| q0W |d r|j|dR  x"|D ]}||krl|j| qlW |d# r|j|dR  x"|D ]}||kr|j| qW |dM r|j|dR  x"|D ]}||kr|j| qW |d }n|d7 }wd}xydt j"dEdF|dGdSdTdU|dI |dM |d |d# ||dC |dV |d ||||dWdXidY t
dZ P W nJ t	k
r } z,|d7 }|dkr|tjt#d#dC W Y d d }~X nX qW |d7 }t
d[ wW n& t$j%k
r } zW Y d d }~X nX d\|d\d\d]d\g d\d\d\g d\d\g g d^g d/|g d/d/d/g g g g d/g g d_} d0|k	rvy|	jd`ddaij}!W n$   |	jdddbijd`j}!Y nX y0|	jd9dcddijd<}ttjj|d$j }W n   x0|	jdddeijd D ]}"df|"kr|"}P qW t&|t&dgk	r\|d |j'd" }yttjj|d$j }W n   |d7 }wY nX Y nX |!| dh< || di< npd5|k	sd6|k	sd8|k	sd7|k
rH|	jd`ddjij}!|	jd ddkij}tj|}yttjj|dlj }W n@   yttjj|dmj }W n   |d7 }wY nX Y nX tj|!| dh< |!| dn< || di< nd?|k
r|	jdddoijd`j}!y"t|	jd ddpijdkdG }W n:   |	jd>ddqijd j}ttjj|drj }Y nX tj|!| dh< |!| dn< || di< || ds< | d j|d  | d	 j|d#  | d
 j|dM  | dK j|dI  | d j|d  | d j|dC  | dt j|dV  || du< t!j!| dL |dM g}|d'kr|d r| dN j(|dR g|  |d r| dO j(|dR g|  |d# r| dP j(|dR g|  |dM r| dQ j(|dR g|  |dI | dv< | dw j|d  |dC | dx< t| dL dykr| dL jd"dzdy}#|#dkr| dL d|# | d{< n| dL ddz | d{< n| dL | d{< t| d| dykr| d| jd"dzdy}#|#dkr| d| d|# | d}< n| d| ddz | d}< n| d| | d}< n|d7 }qd}xyNt j)dEdF| dG|d~ t
d | di tj| df< tj*t+d'd |d7 }d}P W nJ t	k
r } z,|d7 }|dkr|tjt#d#dC W Y d d }~X nX q&W qW t,t-j. d t/ d(B}$|$j0t1| d"  |$j0|dI d"  |$j0t1| |$j0d W d Q R X ttj tj| df< tj*t+d'd d S )NZChinac             S   s   g | ]}|j  qS r0   )strip).0Zindr0   r0   r1   
<listcomp>H   s    zcrawl_alias.<locals>.<listcomp>Zcrawler_country,z	Hong Kongaliassectorindustry	entity_idZentity_name	usd_salesr$   Zdebt_listing_exchangeZlast_crawledZhash_idr   r      T   )headersparamstimeoutzRETRYING...   zhtml.parserZdivclassZl_v2z\d+resultz
box-resultspanZ
fgray_timer      z%Y-%m-%d %H:%M:%Si^   Fr'   Zhrefr(   z
No resultsztotal linkshttps)rA   r   z	//englishsectionZart_contentidZartibodyz//cj.z//k.z//tech.z
//finance.metanameZ	publishidcontentZarticlepz//jx.Zsudametazarticle-bodyz//stock   c             S   s   g | ]}|j d dj qS )r   r   )replacelower)r5   dar0   r0   r1   r6      s    Zn_sinaZ_doci  )indexdoc_typerJ   request_timeout   _sourcecompany_namemain_article   rnnsnss	   Zscripta  ctx._source.company_name.add(params.company_name);ctx._source.entity_id.add(params.entity_id);ctx._source.sector.add(params.sector);ctx._source.industry.add(params.industry);ctx._source.tags.add(params.tags);ctx._source.country.add(params.country);ctx._source.usd_sales.add(params.usd_sales);ctx._source.debt_listing.add(params.debt_listing);ctx._source.r = params.r;ctx._source.n = params.n;ctx._source.ns = params.ns;ctx._source.nss = params.nss;Zpainless   )rX   r;   r9   r:   tagsr$   debt_listingr<   r[   r\   r]   r^   )r#   langr@   )rS   rT   rJ   rU   bodyupdatedzalready existsNULLZSINAZChinese)headlinerY   published_timestamp
source_urlr#   ZISINr9   categoryZauthorZstock_symbolrb   id_on_sourceZ	sentimentr:   rX   Zsource_languager;   headline_originalmain_article_originalra   tags_commonarticle_descarticle_desc_originalr[   r\   r]   r^   tar$   r<   h1Z
art_tit_h1Ztit_04propertyzarticle:published_timeZt_attrGMTZsttrrg   rh   z
main-titler   z%d/%m/%Y %H:%Mz%m/%d, %Y %H:%Mrl   zarticle-headerr&   zsource-timez%Y-%m-%d %H:%Mri   rb   rk   rq   ra   rn   i     ro   rm   rp   )rS   rT   rd   rU   rJ   Zindexed)rS   r   
)r   r   rx   )r   r   rx   rx   rx   )2r,   csv_fileZatsplitquerystringrequestsgeturlr?   	Exceptionprintr&   sleepbsrM   findr   Zsimplified_chin_translatetextrefindalllistmapintZfindAlldatetimestrptime	timestampappendlenr   Zgenerateesr   updater	   r   ZNotFoundErrortyperfindextendrS   Zto_csv	file_pathopenr
   cwdlog_filewritestr)%ZpositionZcompanyZ	list_linkZprevious_lengthZ	pageCountflagZretry_countresespZtotalResultsZtempZ
eachResultZ	timeStampr   skipZeachLinkZoriginal_bodyrd   Z
article_idZeachPZidsZ
data_arrayZ
match_datarY   Zretr[   r\   r]   r^   rR   ra   Z	data_dictrg   Z
timeStampslocr/   r0   r0   r1   crawl_aliasF   sd   H|




"





(




















(

(


"







.r   zexec_count.txtzr+r=   rV   i	  i  rF   iL  rZ   )Zmax_workersc             C   s   g | ]}t jt|qS r0   )executorZsubmitr   )r5   jr0   r0   r1   r6     s    r6   There)waitr'   zSuccess in <   z minutesrx   )NZopensearchpyr   r   r   Zrequests_aws4authr   Zbs4r   r   r|   ZjsonZcsvr&   r   r   r)   Zconcurrent.futuresZ
concurrentr   configparserr   Zpandaspdrandomr	   pathlibr
   Z	v3_filterr   Zv3_elasticId_genr   Zv3_translater   ZstartingCodeconfigreadZ
initializectimerP   r   r   existsmkdirr   r/   r   closer   r   r   r   r   ZawsauthZos_setupr   r~   r?   r   r{   Zread_csvry   r,   r3   r   r   r   seekr   sr   ZfuturesZThreadPoolExecutorr   rS   resultsr   r+   shutdownr0   r0   r0   r1   <module>   s   @



&
  r
 



"

