import scrapy
import sys, re, codecs, shutil
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import CrawlSpider, Rule
#from scrapy.http.request import Request  # don't need this anymore if we can't get FB comments
from scrapy.selector import Selector
from scrapy.selector import HtmlXPathSelector


class MySpider(CrawlSpider):
    name = "buzzfeed"
    allowed_domains = ["www.buzzfeed.com"]
    start_urls = [r"http://www.buzzfeed.com/?country=fr"]

    # For testing, only download pages with two digits in the name;
    # in vivo, \d\d should be replaced with .+ so as to allow all links.
    rules = (Rule(SgmlLinkExtractor(allow=[r'.+']), callback='parse_item', follow=True),)

    def parse_item(self, response):
        filename = response.url.split("/")[-1]
        filename = filename[:20] + ".xml"  # file name is the first 20 characters

        lang = HtmlXPathSelector(response).select('//html/@lang').extract()[0]

        title = HtmlXPathSelector(response).select("//h1[@id='post-title']/text()").extract()[0]
        title = title.encode('utf-8')  # was getting an encoding error; this seems to fix it

        # Get the description of the article, accounting for any extraneous links or other
        # markup nested inside it.
        descriptions = HtmlXPathSelector(response).select(
            "//div[@id='buzz_header']/hgroup/p[@class='description']/descendant-or-self::text()").extract()
        description = ""
        for d in descriptions:
            description += d.encode('utf-8')
        description = description.strip()  # strip() returns a new string; the bare call discarded it

        author = HtmlXPathSelector(response).select('//a[@class="byline__author"]/text()').extract()[0]
        author = author.encode('utf-8')

        affiliation = HtmlXPathSelector(response).select('//div[@class="byline__title"]/text()').extract()[0]
        affiliation = affiliation.encode('utf-8')

        # We want to treat captions one at a time, so don't concatenate them as was done with
        # the description (that was only done to counteract possible extra tags within the
        # description; the captions are not really different objects philosophically).
        # Final remaining problem: string(.) only returns one result, while
        # descendant-or-self::text() returns several, which are treated separately in the loop
        # later. We don't want to concatenate them wholesale, because it's a whole list and each
        # caption number should be joined only to its corresponding text. Might need to get the
        # whole list and then concatenate afterwards in Python rather than in pure XPath (see
        # the sketch below).
        captionHeaders = HtmlXPathSelector(response).select('//div[@data-print="body"]/div/div/h2')
        captions = list()
        commentaires = dict()
        for captionHeader in captionHeaders:
            caption = captionHeader.xpath('string(.)').extract()[0]
            captions.append(caption)
            if captionHeader.xpath("../p"):
                commentaires[caption] = captionHeader.xpath("../p").extract()[0]
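        # A sketch of the "concatenate afterwards in Python" idea from the comment above
        # (an assumption about how it could be done, not part of the original script):
        # pull every text() node under a single header and join them there, so each
        # caption number stays attached to its own text instead of being flattened
        # across all the headers at once.
        #
        # for captionHeader in captionHeaders:
        #     parts = captionHeader.xpath('descendant-or-self::text()').extract()
        #     joined = u"".join(parts).strip()   # one joined string per header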
        # info is the dict we would pass as the meta param to the next request for the
        # Facebook plugin -- i.e., the crawl to the next page to get comments, if any exist.
        info = {"filename": filename, "title": title, "description": description,
                "author": author, "affiliation": affiliation, "captions": captions}

        ############
        # Attempts to get the Facebook comments. Either of these should work if the whole
        # section were not generated with JavaScript; since it is, the only way to get these
        # comments would be to bring in a whole different package, Selenium.
        #
        # commentsPage = HtmlXPathSelector(response).select('//iframe[@class = "fb_ltr"]/@src').extract()[0]
        # commentsPage = commentsPage.encode('utf-8')
        # request = scrapy.Request("http://www.example.com/some_page.html",
        #                          callback=self.parse_page2,
        #                          meta=info)
        # comments = HtmlXPathSelector(response).select('//div[@class="postText"]/descendant-or-self::text()').extract()
        # return request
        ############
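        # A sketch of the Selenium route mentioned above (an assumption, untested here:
        # the XPath for the rendered comment nodes is a guess, and a Firefox install is
        # required for this webdriver to start).
        #
        # from selenium import webdriver
        # import time
        # driver = webdriver.Firefox()
        # driver.get(response.url)
        # time.sleep(5)  # give the JavaScript-generated plugin a moment to render
        # nodes = driver.find_elements_by_xpath('//div[@class="postText"]')
        # comments = [node.text for node in nodes]
        # driver.quit()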
") f.write('') f.write("\n\t
") f.write("\n\t\t") f.write(title) f.write("\t\t") f.write("\n\t\t") f.write(description) f.write("\t\t") f.write('\n\t
') f.write("\n\t") f.write("\n\t\t") f.write(author) f.write("\t\t") f.write("\n\t\t") f.write(affiliation) f.write("\t\t") f.write('\n\t') f.write('\n\t') f.write("\n\t\t
    ") for caption in captions: caption = caption.encode('utf-8') f.write("\n\t\t\t
  1. ") f.write("\n\t\t\t\t") f.write(caption) f.write("\n\t\t\t\t") if caption in commentaires: f.write("\n\t\t\t\t") commentaire = commentaires[caption].encode('utf-8') f.write(commentaire) f.write("\n\t\t\t\t") f.write("\n\t\t\t
  2. ") f.write('\n\t') f.write("\n\t\t
") f.write("\n\t") f.write("\n\t\t") f.write("\n\t\t") f.write("\n\t") f.write("
") ############### #Meant to be the start of the now unnecessary method to parse fb comments from the plugin. # def parse_page2(self, response): # with open("TEST.txt", "ab") as f: # f.write(response.meta["filename"])