Difference between revisions of "Moroccan Parliament Web Crawler"

From edegan.com
Jump to navigation Jump to search
Line 63: Line 63:
 
     import string
 
     import string
 
     import re
 
     import re
 +
    ##########################
 
     #launch Google Chrome Browser
 
     #launch Google Chrome Browser
 
     driver = webdriver.Chrome()
 
     driver = webdriver.Chrome()
 
+
    ##########################
 
     def switch_window():
 
     def switch_window():
 
         handles = driver.window_handles
 
         handles = driver.window_handles
 
         driver.switch_to_window(handles[-1])
 
         driver.switch_to_window(handles[-1])
 
+
    ##########################
 
     #Visit desired website
 
     #Visit desired website
 
     driver.get('http://www.chambredesrepresentants.ma/ar/%D8%A7%D9%84%D8%AA%D8%B4%D8%B1%D9%8A%D8%B9/%D9%84%D8%A7%D8%A6%D8%AD%D8%A9-            %D9%85%D9%82%D8%AA%D8%B1%D8%AD%D8%A7%D8%AA-%D8%A7%D9%84%D9%82%D9%88%D8%A7%D9%86%D9%8A%D9%86?body_value=&field_og_commission_target_id=All')
 
     driver.get('http://www.chambredesrepresentants.ma/ar/%D8%A7%D9%84%D8%AA%D8%B4%D8%B1%D9%8A%D8%B9/%D9%84%D8%A7%D8%A6%D8%AD%D8%A9-            %D9%85%D9%82%D8%AA%D8%B1%D8%AD%D8%A7%D8%AA-%D8%A7%D9%84%D9%82%D9%88%D8%A7%D9%86%D9%8A%D9%86?body_value=&field_og_commission_target_id=All')
 
+
    ##########################
 
     bills_list = driver.find_elements_by_xpath("//li/h3/a")
 
     bills_list = driver.find_elements_by_xpath("//li/h3/a")
 
     for i in range(len(bills_list)):
 
     for i in range(len(bills_list)):
Line 78: Line 79:
 
         bills_list[i].click()
 
         bills_list[i].click()
 
         ActionChains(driver).key_up(Keys.SHIFT).perform()
 
         ActionChains(driver).key_up(Keys.SHIFT).perform()
 
 
         switch_window()
 
         switch_window()
 
         url = driver.current_url
 
         url = driver.current_url
         unicode_url =  urllib.unquote(str(url)).decode('utf8')
+
         unicode_url =  urllib.unquote(str(url)).decode('utf8')  
   
 
 
         url_parts = string.split(unicode_url, "/")
 
         url_parts = string.split(unicode_url, "/")
 
         i = len(url_parts)
 
         i = len(url_parts)
   
+
      ##########################
 
 
 
         #Build arabic tag backwards, accounting for backwards spelling
 
         #Build arabic tag backwards, accounting for backwards spelling
 
         tag = ""
 
         tag = ""
Line 92: Line 90:
 
             tag += url_parts[i - 1]
 
             tag += url_parts[i - 1]
 
             i -= 1
 
             i -= 1
 +
      ##########################
 
         #Navigate to pdf of website
 
         #Navigate to pdf of website
 
         change_button  = driver.find_elements_by_xpath("//a [@class='pdf' and @rel='nofollow']")[0]
 
         change_button  = driver.find_elements_by_xpath("//a [@class='pdf' and @rel='nofollow']")[0]
Line 97: Line 96:
 
         change_button.click()
 
         change_button.click()
 
         ActionChains(driver).key_up(Keys.SHIFT).perform()
 
         ActionChains(driver).key_up(Keys.SHIFT).perform()
 
 
         switch_window()
 
         switch_window()
 
+
      #########################
 
         #Gets current window's URL
 
         #Gets current window's URL
 
         url = driver.current_url
 
         url = driver.current_url
 
+
      ########################
 
         #Saves file at URL to current directory  
 
         #Saves file at URL to current directory  
 
         urllib.urlretrieve(url, tag)
 
         urllib.urlretrieve(url, tag)
 
 
         driver.close()
 
         driver.close()
 
 
         switch_window()
 
         switch_window()
     
+
      #########################
 
         pdfs_on_page = driver.find_elements_by_xpath("//div/div/div/article/div/ul/li/a")
 
         pdfs_on_page = driver.find_elements_by_xpath("//div/div/div/article/div/ul/li/a")
 
 
         #finds interior pdfs on the page
 
         #finds interior pdfs on the page
 
         if pdfs_on_page:
 
         if pdfs_on_page:
 
             for j in range(len(pdfs_on_page)):
 
             for j in range(len(pdfs_on_page)):
 
                 element = pdfs_on_page[j]
 
                 element = pdfs_on_page[j]
 
+
              #######################
 
                 #click on pdf
 
                 #click on pdf
 
                 ActionChains(driver).key_down(Keys.SHIFT).perform()
 
                 ActionChains(driver).key_down(Keys.SHIFT).perform()
 
                 element.click()
 
                 element.click()
 
                 ActionChains(driver).key_up(Keys.SHIFT).perform()
 
                 ActionChains(driver).key_up(Keys.SHIFT).perform()
 
 
                 switch_window()
 
                 switch_window()
 
 
                 url = driver.current_url
 
                 url = driver.current_url
 
                 pdf_tag = string.split(str(url), "/")[-1]
 
                 pdf_tag = string.split(str(url), "/")[-1]
 
+
              ######################
 
                 #leaves link if it is not a pdf
 
                 #leaves link if it is not a pdf
 
                 if re.findall(".pdf", pdf_tag):
 
                 if re.findall(".pdf", pdf_tag):
 
 
                     #saves interior pdf
 
                     #saves interior pdf
                     urllib.urlretrieve(url, pdf_tag)
+
                     urllib.urlretrieve(url, pdf_tag)              
                   
 
 
                 driver.close()
 
                 driver.close()
 
 
                 switch_window()
 
                 switch_window()
          
+
         ########################
 
         driver.close()
 
         driver.close()
 
 
         switch_window()
 
         switch_window()
   
+
  #########################
 
     print "download complete"   
 
     print "download complete"   
 
+
    #########################
 
     #close browser
 
     #close browser
 
     driver.quit()
 
     driver.quit()

Revision as of 14:47, 13 October 2016


McNair Project
Moroccan Parliament Web Crawler
Project logo 02.png
Project Information
Project Title
Start Date
Deadline
Primary Billing
Notes
Has project status
Copyright © 2016 edegan.com. All Rights Reserved.


Overview

This web driver is designed to save information from the Moroccan Legislature website as a pdf before the information is removed from the website due to lack of space.

Navigating the Site

On the right hand side of the website, the final bullet is a link to archived bills.

On the right-hand side of the website, one bullet above the last header, is a link to a page listing all the proposed bills. When clicked, the user is directed to the most recent batch of links to proposed bills. The navigation button on the bottom left, in Arabic, means "go to last page", and the second from the left means "previous page". When "go to last page" is clicked, the batch of links to proposed bills that are about to be removed from the website is listed, with the oldest link at the bottom of the list.

When any proposed bill link is clicked, it opens up a page with information about the bill, and about discussions in parliament. There are up to two hyperlinks on this page, the first of which is a pdf of the original bill. If there is a second link further below, it contains a pdf of the Committee Report for the bill (if it was sent to a Committee).

The two pdfs, as well as the webpage of the proposed bill, may be subject to record keeping.


Main Page

Driving Opportunities

Five websites within the Moroccan Legislature site have data that needs to be recorded.

Moroccan Monarchy Proposed Bills

Monarchy Proposed Bills

The data that needs to be extracted from this site includes the pdfs of all the bill pages, as well as any interior pdfs on each page. The bill pages should be named by their url, and the interior pdfs should be named by their respective bill numbers.

Moroccan House of Representatives Proposed Bills

House Proposed Bills

See Monarchy proposed bills for instructions.

Moroccan Legislature Ratified Bills

Ratified Bills

See Monarchy proposed bills for instructions.

Moroccan Legislature Oral Questions

Oral Questions

Moroccan Legislature Written Questions

Written Questions


Example Code

   #General Bill Download
   from selenium import webdriver
   from selenium.webdriver.common.action_chains import ActionChains
   from selenium.webdriver.common.keys import Keys
   import time
   import urllib
   import string
   import re
   ##########################
   #Launch a Google Chrome browser through Selenium. The resulting session is
   #kept in the module-level name `driver`, which the rest of the script uses.
   driver = webdriver.Chrome()
   ##########################
   def switch_window():
       """Switch the module-level driver to the last handle in driver.window_handles."""
       newest_handle = driver.window_handles[-1]
       driver.switch_to_window(newest_handle)
   ##########################
   #Visit desired website, then save every listed bill page and its interior pdfs.
   #NOTE(review): the URL below contains a run of spaces after "%D8%A9-"; this
   #looks like a line-wrap artifact from the wiki page -- confirm before running.
   #The literal is reproduced unchanged.
   #Everything runs inside try/finally so the browser is shut down even when a
   #step raises part-way through the crawl.
   try:
       driver.get('http://www.chambredesrepresentants.ma/ar/%D8%A7%D9%84%D8%AA%D8%B4%D8%B1%D9%8A%D8%B9/%D9%84%D8%A7%D8%A6%D8%AD%D8%A9-            %D9%85%D9%82%D8%AA%D8%B1%D8%AD%D8%A7%D8%AA-%D8%A7%D9%84%D9%82%D9%88%D8%A7%D9%86%D9%8A%D9%86?body_value=&field_og_commission_target_id=All')
       ##########################
       #Each link is shift-clicked and the driver is then moved to the last
       #window handle, so the link is expected to open in a new window while
       #the listing window (and bills_list) stays loaded.
       bills_list = driver.find_elements_by_xpath("//li/h3/a")
       for bill_link in bills_list:
           ActionChains(driver).key_down(Keys.SHIFT).perform()
           bill_link.click()
           ActionChains(driver).key_up(Keys.SHIFT).perform()
           switch_window()
           url = driver.current_url
           unicode_url = urllib.unquote(str(url)).decode('utf8')
           url_parts = unicode_url.split("/")
           ##########################
           #Build arabic tag backwards, accounting for backwards spelling:
           #every URL segment from index 4 onward, last segment first.
           #(Done with a slice so the outer loop variable is never overwritten.)
           tag = "".join(reversed(url_parts[4:]))
           ##########################
           #Navigate to pdf of website
           #(indexing [0] raises IndexError if the page has no matching pdf link)
           change_button = driver.find_elements_by_xpath("//a [@class='pdf' and @rel='nofollow']")[0]
           ActionChains(driver).key_down(Keys.SHIFT).perform()
           change_button.click()
           ActionChains(driver).key_up(Keys.SHIFT).perform()
           switch_window()
           #########################
           #Gets current window's URL
           url = driver.current_url
           ########################
           #Saves file at URL to current directory, named by the tag built above
           urllib.urlretrieve(url, tag)
           #close the pdf window and move to the last remaining handle
           driver.close()
           switch_window()
           #########################
           #finds interior pdfs on the page (an empty result simply skips the loop)
           pdfs_on_page = driver.find_elements_by_xpath("//div/div/div/article/div/ul/li/a")
           for element in pdfs_on_page:
               #######################
               #click on pdf
               ActionChains(driver).key_down(Keys.SHIFT).perform()
               element.click()
               ActionChains(driver).key_up(Keys.SHIFT).perform()
               switch_window()
               url = driver.current_url
               pdf_tag = str(url).split("/")[-1]
               ######################
               #leaves link if it is not a pdf
               #(dot is escaped so only a literal ".pdf" counts as a match)
               if re.search(r"\.pdf", pdf_tag):
                   #saves interior pdf under the last segment of its URL
                   urllib.urlretrieve(url, pdf_tag)
               driver.close()
               switch_window()
           ########################
           #close the bill window and move to the last remaining handle
           driver.close()
           switch_window()
       #########################
       print("download complete")
   finally:
       #########################
       #close browser
       driver.quit()

Further Inquiries

Further inquiries have been requested for the Kuwaiti, Tunisian, and Algerian Parliaments.

Kuwait Parliament

Kuwait Site

Data to download (tables and PDF files): Monarchy - Proposed Bills (مشروع بقانون), for the 13th and 14th terms; Parliament - Proposed Bills (اقتراح بقانون), for the 14th term; Proposals (اقتراح بــرغــبـــة), for the 13th and 14th terms; Meeting Agendas/Minutes (جدول اعمال الجلسه), for the 14th term.

Tunisian Parliament

Tunisian Site

Data to Download(save web pages as PDFs and documents as PDFs):

Bills proposed to parliament (مشروع قانون معروض على المجلس): click here; Bills proposed to legislative committees (مشروع قانون معروض على اللجان): click here; General Assembly Deliberations (الجلسات العامة): click here


Algerian Parliament

Algerian Site

Data to download: Bills that have been voted on (القوانين المصوت عليها الفترة التشريعية السابعة : 2012 - 2017): click here; Parliamentary initiatives/questions (المبادرات البرلمانية): click here