Cần leech content chuyên mục "Du lịch" web "eva.vn" về wp (sync ảnh lên imgur)

Discussion in 'Chợ Việc Làm MMO' started by gu gồ, Nov 2, 2017.

  1. gu gồ

    gu gồ Administrator Staff Member

  2. Luxifer

    Luxifer Sơ Nhập Giang Hồ

    Trước mình đã crawl trang này một lần; gửi code tham khảo cho ai muốn viết bằng Python.

    Code:
    import os
    import slugify
    import traceback
    
    from bs4 import BeautifulSoup, NavigableString, Comment
    from ..lib.web import get_html_content, download_image
    from app.mod_be.models.media import Media
    
    class Eva(object):
        """
        Scraper for a single eva.vn article page.

        The page is fetched once by the constructor. Every ``post_*`` method
        parses the stored HTML into a fresh tree, so the tree edits performed
        by ``post_body`` never affect later calls.
        """

        def __init__(self, url, site_domain):
            """
            url: address of the article to fetch.
            site_domain: prefix placed in front of the links returned by
                download_image when the image tags are rebuilt.
            """
            self.site_domain = site_domain
            self.url = url
            # NOTE: this call rebinds `self.html` from the bound method to the
            # fetched content, so from here on `self.html` is data, not a callable.
            self.html()

        def html(self):
            """
            Fetch the content of self.url, store it in self.html and return it.

            The assignment shadows this method on the instance, so it can only
            be invoked through the instance once (the constructor does that).
            """
            self.html = get_html_content(self.url)
            return self.html

        def _soup(self):
            """
            Parse the stored page into a new BeautifulSoup tree.
            """
            return BeautifulSoup(self.html, 'html.parser')

        def _meta_content(self, property_name):
            """
            Return the `content` attribute of the first <meta> tag whose
            `property` equals property_name, or '' when the tag or the
            attribute is missing.
            """
            tag = self._soup().find("meta", property=property_name)
            if tag is None:
                return ''
            return tag.get('content') or ''

        def post_title(self):
            """
            Return the og:title of the page, or '' when it is not present.
            """
            return self._meta_content("og:title")

        def post_description(self):
            """
            Return the og:description of the page, or '' when it is not present.
            """
            return self._meta_content("og:description")

        @staticmethod
        def _image_filename(base_name, index, src):
            """
            Build the local file name "<base_name>-<index><ext>" for an image.

            The extension is taken from src with any query string or fragment
            removed, so "a.jpg?w=600" yields ".jpg" rather than ".jpg?w=600".
            """
            path_only = src.split('#', 1)[0].split('?', 1)[0]
            extension = os.path.splitext(path_only)[1]
            return "{0}-{1}{2}".format(base_name, index, extension)

        def _raw_body(self, soup, body):
            """
            Render the children of body as an HTML string.

            Children without any <img> are copied verbatim. A child that
            contains <img> tags is replaced by one centered <p><img/></p> per
            "news-image" image that was downloaded and stored as Media; the
            rest of that child is dropped.
            """
            title = self.post_title()
            img_name = slugify.slugify(title) if title else ''

            # remove js scripts in the body
            for script in body.find_all('script'):
                script.extract()

            # remove related posts; a page may not have these blocks at all
            for related_class in ("baiviet-bailienquan", "baiviet-bailienquan-bottom"):
                related = soup.find('div', class_=related_class)
                if related is not None:
                    related.decompose()

            parts = []
            img_count = 0
            for content in body.contents:
                # Comment and the other string node types all derive from
                # NavigableString; only real tags are rendered.
                if isinstance(content, NavigableString):
                    continue

                if not content.find("img"):
                    parts.append(str(content))
                    continue

                for img in content.find_all('img', class_="news-image"):
                    src = img.get('src')
                    if not src:
                        # nothing to download for an <img> without a source
                        continue

                    img_count += 1
                    img_new_name = self._image_filename(img_name, img_count, src)
                    downloaded_img_link = download_image(url=src, name=img_new_name)
                    if not downloaded_img_link:
                        continue

                    new_img_link = self.site_domain + '/' + downloaded_img_link

                    # save media info to db. we don't need thumbnail
                    media = Media(uri=downloaded_img_link,
                                  thumbnail_uri=downloaded_img_link,
                                  caption=os.path.basename(downloaded_img_link))
                    if media.add():
                        # create new img tag instead of using the original one
                        parts.append(
                            '<p style="text-align: center; "><img src="{0}"/></p>'.format(new_img_link))

            return "".join(parts)

        def post_body(self, format='raw'):
            """
            Return the article body found in <div class="text-conent">.

            format='raw'  -> HTML string built from the body's child tags, with
                             scripts and related-post blocks removed and the
                             images re-hosted (see _raw_body)
            format='text' -> only text, no html tag

            Returns '' when the body container is missing, when format is not
            one of the two values above, or when processing fails (the
            traceback is printed in that case).
            """
            try:
                soup = self._soup()
                body = soup.find("div", class_="text-conent")

                if body is None:
                    return ''

                if format == 'text':
                    return body.text
                if format == 'raw':
                    return self._raw_body(soup, body)
                return ''
            except Exception:
                print(traceback.format_exc())
                return ''
    
     
    Hau Nguyen, Vajimi and saivnn like this.
  3. Nai

    Nai MiddleMan Staff Member

    Ủa, sao bác Gu gồ mà cũng cần nhờ người code nữa à. Thật là vô lý, phi lí, bất hợp lý
     
  4. gu gồ

    gu gồ Administrator Staff Member

    ối giời ơi, em có biết code đâu bác =))
     
  5. gu gồ

    gu gồ Administrator Staff Member

    mù py luôn kakaka
     
  6. Nai

    Nai MiddleMan Staff Member

    Thế bác đang làm Mod của box nào thế :D
     
  7. gu gồ

    gu gồ Administrator Staff Member

    cũng ko biết nữa, lão Khang lầy set box nào đó ko biết =))
     
  8. _nEwbiE

    _nEwbiE Sơ Nhập Giang Hồ

    em làm được bác nhé, contact ngay fb em nói chuyện cho dễ