# Forum note (translated from Vietnamese): "I crawled this site once before;
# here is reference code for anyone writing the crawler in Python."
#
# NOTE(review): the original paste had lost all line breaks and indentation;
# the nesting below is reconstructed from the syntax - confirm against the
# real source if it is available.

import os
import traceback

import slugify
from bs4 import BeautifulSoup, NavigableString, Comment

from ..lib.web import get_html_content, download_image
# NOTE(review): the module path of the next import was lost in the paste (it
# read "from import Media", which is a syntax error). Restore the project
# module that defines Media before using post_body(format='raw').
# from <module> import Media


class Eva(object):
    """Scraper for a single article page.

    The page is downloaded once when the object is created. The Open Graph
    title/description and the article body are then extracted from that
    HTML on demand.
    """

    def __init__(self, url, site_domain):
        """Remember the inputs and download the page.

        Args:
            url: Address of the page to fetch.
            site_domain: Prefix that is joined with '/' in front of the link
                of every re-hosted image.
        """
        self.site_domain = site_domain
        self.url = url
        self.html = get_html_content(self.url)

    @property
    def html(self):
        """HTML content of the given url, as returned by get_html_content.

        This used to be a method that overwrote itself with the downloaded
        string on first call. A property keeps ``obj.html`` readable as the
        page content without shadowing a method.
        """
        return self._html

    @html.setter
    def html(self, value):
        self._html = value

    def _soup(self):
        """Parse the fetched page; a missing page parses as an empty document."""
        return BeautifulSoup(self.html or '', 'html.parser')

    def _meta_content(self, og_property):
        """Return the ``content`` of the ``<meta property=...>`` tag, or ''."""
        tag = self._soup().find("meta", property=og_property)
        if tag:
            return tag.get('content') or ''
        return ''

    def post_title(self):
        """Return the ``og:title`` meta content, or '' when it is absent."""
        return self._meta_content("og:title")

    def post_description(self):
        """Return the ``og:description`` meta content, or '' when it is absent."""
        return self._meta_content("og:description")

    def post_body(self, format='raw'):
        """Return the article body found in ``<div class="text-conent">``.

        Args:
            format: 'raw' returns an HTML string built from the body's child
                elements, with images re-hosted. 'text' returns only the
                text, with no HTML tags.

        Returns:
            The body as a string. '' is returned when the body container is
            missing or when any error occurs (the traceback is printed).
            Any other ``format`` value yields None.
        """
        title = self.post_title()
        # Downloaded images are named after the slugified title.
        img_name = slugify.slugify(title) if title else ''
        try:
            soup = self._soup()
            body = soup.find("div", class_="text-conent")
            if not body:
                # Was the stray literal 's'; every other failure returns ''.
                return ''
            if format == 'text':
                return body.text
            if format != 'raw':
                return None
            return self._raw_body(soup, body, img_name)
        except Exception:
            # Best effort: report the problem and hand back an empty body.
            print(traceback.format_exc())
            return ''

    def _raw_body(self, soup, body, img_name):
        """Build the 'raw' HTML string for ``body``.

        Scripts and the related-post boxes are removed. Child elements
        without images are copied as-is. A child element that contains
        images is replaced by one centred ``<img>`` paragraph per
        successfully re-hosted ``news-image`` image.
        """
        # Remove JS scripts in the body.
        for script in body('script'):
            script.extract()

        # Remove related-post boxes. They are optional, so a page without
        # them must not fail.
        for css_class in ("baiviet-bailienquan", "baiviet-bailienquan-bottom"):
            related = soup.find('div', class_=css_class)
            if related is not None:
                related.decompose()

        parts = []
        img_count = 0
        for content in body.contents:
            # Skip bare text nodes and comments; only elements are kept.
            if isinstance(content, (Comment, NavigableString)):
                continue
            if not content.find("img"):
                parts.append(str(content))
                continue
            for img in content.find_all('img', class_="news-image"):
                src = img.get('src')
                if not src:
                    # Nothing to download; skip instead of aborting the body.
                    continue
                img_count += 1
                img_new_name = "{0}-{1}{2}".format(
                    img_name, img_count, os.path.splitext(src)[1])
                downloaded_img_link = download_image(url=src, name=img_new_name)
                if not downloaded_img_link:
                    continue
                new_img_link = self.site_domain + '/' + downloaded_img_link
                # Save media info to db. We don't need a separate thumbnail,
                # so the image itself is used as the thumbnail.
                media = Media(uri=downloaded_img_link,
                              thumbnail_uri=downloaded_img_link,
                              caption=os.path.basename(downloaded_img_link))
                if media.add():
                    # Create a new img tag instead of using the original one.
                    parts.append(
                        '<p style="text-align: center; "><img src="{0}"/></p>'
                        .format(new_img_link))
        return ''.join(parts)