import scrapy
import sys, re, codecs, shutil
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
# from scrapy.http.request import Request  # no longer needed, since we can't get the Facebook comments (see below)
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector
class MySpider(CrawlSpider):
name = "buzzfeed"
allowed_domains = [r"www.buzzfeed.com"]
start_urls = [
r"http://www.buzzfeed.com/?country=fr"
]
    # for testing, we only downloaded pages with two digits in the name;
    # in production, r'.+' allows all links (a testing variant is sketched below)
    rules = (Rule(SgmlLinkExtractor(allow=[r'.+']),
                  callback='parse_item', follow=True),)
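    # Testing variant (an illustrative sketch; swap it in to restrict the crawl):
    # rules = (Rule(SgmlLinkExtractor(allow=[r'\d\d']),
    #               callback='parse_item', follow=True),)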
def parse_item(self, response):
filename = response.url.split("/")[-1]
        filename = filename[:20] + ".xml"  # keep only the first 20 characters of the URL slug
lang = HtmlXPathSelector(response).select('//html/@lang').extract()[0]
title = HtmlXPathSelector(response).select("//h1[@id='post-title']/text()").extract()[0]
        title = title.encode('utf-8')  # writing the raw unicode raised an encoding error; encoding up front fixes it
        # get the article description; concatenating every descendant text node
        # accounts for any inline links or other markup inside it
descriptions = HtmlXPathSelector(response).select("//div[@id='buzz_header']/hgroup/p[@class='description']/descendant-or-self::text()").extract()
description = ""
for d in descriptions:
description = description + d.encode('utf-8')
        description = description.strip()  # strip() returns a new string, so it must be reassigned
author = HtmlXPathSelector(response).select('//a[@class="byline__author"]/text()').extract()[0]
author = author.encode('utf-8')
affiliation = HtmlXPathSelector(response).select('//div[@class="byline__title"]/text()').extract()[0]
affiliation = affiliation.encode('utf-8')
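        # NOTE: each .extract()[0] above raises IndexError on pages that lack the
        # element (e.g. non-article pages the crawl wanders onto). A defensive
        # sketch, assuming we simply want to skip such pages:
        #   nodes = HtmlXPathSelector(response).select("//h1[@id='post-title']/text()").extract()
        #   if not nodes:
        #       return
        #   title = nodes[0]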
        # we want to treat captions one at a time, so don't concatenate them as was done
        # with the description (that was only done to counteract possible extra tags
        # within the description, not because the pieces are really different objects)
        # final remaining problem: string(.) only returns one string, but with
        # descendant-or-self::text() I can get several, which are treated separately in
        # the loop later. I don't want to concatenate those, because it's a whole list and
        # I only want to join each number to its corresponding text; might need to get the
        # whole list and then concatenate afterwards in Python rather than pure XPath.
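        # For illustration, the Python-side join described above might look like
        # (a sketch, not what this spider currently does):
        #   parts = captionHeader.xpath('descendant-or-self::text()').extract()
        #   caption = u"".join(parts).strip()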
captionHeaders = HtmlXPathSelector(response).select('//div[@data-print="body"]/div/div/h2')
captions = list()
commentaires = dict()
for captionHeader in captionHeaders:
caption = captionHeader.xpath('string(.)').extract()[0]
captions.append(caption)
if captionHeader.xpath("../p"):
commentaires[caption] = captionHeader.xpath("../p").extract()[0]
#info will be the dict that we pass as the meta param to the next request for the Facebook plugin
info = {"filename": filename, "title": title, "description": description, "author": author, "affiliation": affiliation, "captions": captions}
#this is what we'll use to crawl to the next page to get comments, if any exist.
############
        # Attempts to get the Facebook comments. Either of these approaches would work
        # if the comments section were not generated with JavaScript; since it is, the
        # only way to get the comments would be to bring in a separate package, Selenium.
        #
# commentsPage = HtmlXPathSelector(response).select('//iframe[@class = "fb_ltr"]/@src').extract()[0]
# commentsPage = commentsPage.encode('utf-8')
# request = scrapy.Request("http://www.example.com/some_page.html",
# callback=self.parse_page2,
# meta=info)
# comments = HtmlXPathSelector(response).select('//div[@class="postText"]/descendant-or-self::text()').extract()
# return request
############
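        # A possible Selenium-based sketch (an untested assumption: once JavaScript
        # runs, the iframe at commentsPage renders each comment into div.postText):
        #
        #   from selenium import webdriver
        #   driver = webdriver.Firefox()
        #   driver.get(commentsPage)
        #   comments = [el.text for el in
        #               driver.find_elements_by_class_name("postText")]
        #   driver.quit()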
        if lang == 'fr':
            with open("TEST.txt", 'ab') as f:
                # NOTE: the literal markup in these writes was stripped at some point
                # (everything between angle brackets was lost); the tag names below are
                # reconstructed guesses based on the variables being written out.
                f.write('<?xml version="1.0" encoding="utf-8"?>')
                f.write('\n')
                f.write("<article>")
                f.write("\n\t<header>")
                f.write("\n\t\t<title>")
                f.write(title)
                f.write("</title>")
                f.write("\n\t\t<description>")
                f.write(description)
                f.write("</description>")
                f.write('\n\t</header>')
                f.write("\n\t<byline>")
                f.write("\n\t\t<author>")
                f.write(author)
                f.write("</author>")
                f.write("\n\t\t<affiliation>")
                f.write(affiliation)
                f.write("</affiliation>")
                f.write('\n\t</byline>')
                f.write('\n\t<captions>')
                for caption in captions:
                    caption = caption.encode('utf-8')
                    f.write("\n\t\t<caption>")
                    f.write("\n\t\t\t<text>")
                    f.write(caption)
                    f.write("</text>")
                    if caption in commentaires:
                        f.write("\n\t\t\t<commentaire>")
                        commentaire = commentaires[caption].encode('utf-8')
                        f.write(commentaire)
                        f.write("</commentaire>")
                    f.write("\n\t\t</caption>")
                f.write('\n\t</captions>')
                f.write("\n</article>")
                f.write("\n")
###############
    # Meant to be the start of the now-unnecessary callback that would have parsed
    # the Facebook comments from the plugin page.
# def parse_page2(self, response):
# with open("TEST.txt", "ab") as f:
# f.write(response.meta["filename"])