New Python Scripts

Applied Stacks - Open Infrastructure Documentation, Advocacy By Example.

Jump to: navigation, search

Contents

Eprints Python Script

#!/usr/bin/python

import urllib
import re

# Page that geturls() is pointed at when this script is run directly.
SITE_URL = 'http://www.eprints.org/software/archives/'

def geturls(url):
	"""Print every link found in the archive listing section of *url*.

	The page is read line by line.  Scanning starts on the line holding the
	"<p>N archives listed.</p>" paragraph (any count N) and stops at the line
	holding the "<!-- start footer -->" marker; the marker line itself is not
	scanned.  Each extracted href is printed on its own line.

	Args:
		url: address of the page to scrape; anything urlopen() accepts.

	Returns:
		None.  The URLs are written to standard output.
	"""
	# Local import so the function runs on both Python 2 and Python 3.
	try:
		from urllib.request import urlopen
	except ImportError:
		from urllib import urlopen

	# The count on the page changes over time, so do not pin it to 269.
	start_marker = re.compile(r'<p>\d+ archives listed\.</p>')
	end_marker = '<!-- start footer -->'
	# findall() instead of match(r'.*<a ...'): the greedy prefix only ever
	# captured the last link on a line and silently dropped the others.
	link_re = re.compile(r'<a href="([^"]+)"')

	in_url_section = False
	page = urlopen(url)
	try:
		for line in page:
			# Python 3 yields bytes; Python 2 already yields str.
			if not isinstance(line, str):
				line = line.decode('utf-8', 'replace')

			# have we reached the site URLs yet?
			if start_marker.search(line):
				in_url_section = True
			# did we leave the URL section?
			elif end_marker in line:
				in_url_section = False

			if not in_url_section:
				continue

			for href in link_re.findall(line):
				print(href)
	finally:
		# Release the connection even if parsing fails part-way.
		page.close()

# Entry point: scrape SITE_URL and print its links only when run as a script.
if __name__ == '__main__':
	geturls(SITE_URL)

Firebird Python Scripts

#!/usr/bin/python

import urllib
import re

# Page that geturls() is pointed at when this script is run directly.
SITE_URL = 'http://www.ibphoenix.com/main.nfs?a=ibphoenix&page=ibp_powered_firebird'

def geturls(url):
	"""Print every link found in the listing section of *url*.

	The page is read line by line.  Scanning starts on the line containing
	'<table width="100%">' and stops at the line containing the
	'This site is used for our external public information' text; that
	closing line is not scanned.  Each extracted href is printed on its own
	line.

	Args:
		url: address of the page to scrape; anything urlopen() accepts.

	Returns:
		None.  The URLs are written to standard output.
	"""
	# Local import so the function runs on both Python 2 and Python 3.
	try:
		from urllib.request import urlopen
	except ImportError:
		from urllib import urlopen

	start_marker = '<table width="100%">'
	end_marker = 'This site is used for our external public information'
	# findall() instead of match(r'.*<a ...'): the greedy prefix only ever
	# captured the last link on a line and silently dropped the others.
	link_re = re.compile(r'<a href="([^"]+)"')

	in_url_section = False
	page = urlopen(url)
	try:
		for line in page:
			# Python 3 yields bytes; Python 2 already yields str.
			if not isinstance(line, str):
				line = line.decode('utf-8', 'replace')

			# have we reached the site URLs yet?
			if start_marker in line:
				in_url_section = True
			# did we leave the URL section?
			elif end_marker in line:
				in_url_section = False

			if not in_url_section:
				continue

			for href in link_re.findall(line):
				print(href)
	finally:
		# Release the connection even if parsing fails part-way.
		page.close()

# Entry point: scrape SITE_URL and print its links only when run as a script.
if __name__ == '__main__':
	geturls(SITE_URL)

XOQqjo <a href="http://ouysswvvpgyk.com/">ouysswvvpgyk</a>, [url=http://mrorgicxywjw.com/]mrorgicxywjw[/url], [link=http://wabewfrycqvz.com/]wabewfrycqvz[/link], http://aaowoixgmzcd.com/

Lighttpd Python Scripts

#!/usr/bin/python

import urllib
import re

# Page that geturls() is pointed at when this script is run directly.
SITE_URL = 'http://redmine.lighttpd.net/wiki/1/PoweredByLighttpd'

def geturls(url):
	"""Print every external link found in the listing section of *url*.

	The page is read line by line.  Scanning starts on the line containing
	'more than 100mio req/day (or 1000 req/s)' and stops at the line
	containing '<p class="other-formats">'; that closing line is not scanned.
	Only anchors written as <a class="external" href="..."> are extracted,
	and each href is printed on its own line.

	Args:
		url: address of the page to scrape; anything urlopen() accepts.

	Returns:
		None.  The URLs are written to standard output.
	"""
	# Local import so the function runs on both Python 2 and Python 3.
	try:
		from urllib.request import urlopen
	except ImportError:
		from urllib import urlopen

	start_marker = 'more than 100mio req/day (or 1000 req/s)'
	end_marker = '<p class="other-formats">'
	# findall() instead of match(r'.*<a ...'): the greedy prefix only ever
	# captured the last link on a line and silently dropped the others.
	link_re = re.compile(r'<a class="external" href="([^"]+)"')

	in_url_section = False
	page = urlopen(url)
	try:
		for line in page:
			# Python 3 yields bytes; Python 2 already yields str.
			if not isinstance(line, str):
				line = line.decode('utf-8', 'replace')

			# have we reached the site URLs yet?
			if start_marker in line:
				in_url_section = True
			# did we leave the URL section?
			elif end_marker in line:
				in_url_section = False

			if not in_url_section:
				continue

			for href in link_re.findall(line):
				print(href)
	finally:
		# Release the connection even if parsing fails part-way.
		page.close()

# Entry point: scrape SITE_URL and print its links only when run as a script.
if __name__ == '__main__':
	geturls(SITE_URL)

foKrXw <a href="http://cscwfprsymyd.com/">cscwfprsymyd</a>, [url=http://xdgtoidkbyti.com/]xdgtoidkbyti[/url], [link=http://rxtoocvxfvrv.com/]rxtoocvxfvrv[/link], http://ydowgemnruet.com/

Drupal Python Scripts

#!/usr/bin/python

import urllib
import re

# Page that geturls() is pointed at when this script is run directly.
SITE_URL = 'http://sixrevisions.com/design-inspiration/31-drupal-content-management-system-cms/'


def geturls(url):
	"""Print every link found in the showcase section of *url*.

	The page is read line by line.  Scanning starts on the line containing
	the "This article showcases 31 websites" paragraph and stops at the line
	containing the '<h3>Other websites that run on Drupal</h3>' heading; that
	heading line is not scanned.  Each extracted href is printed on its own
	line.

	Args:
		url: address of the page to scrape; anything urlopen() accepts.

	Returns:
		None.  The URLs are written to standard output.
	"""
	# Local import so the function runs on both Python 2 and Python 3.
	try:
		from urllib.request import urlopen
	except ImportError:
		from urllib import urlopen

	start_marker = '<p>This article showcases <strong>31 websites</strong> that run on Drupal.</p>'
	end_marker = '<h3>Other websites that run on Drupal</h3>'
	# findall() instead of match(r'.*<a ...'): the greedy prefix only ever
	# captured the last link on a line and silently dropped the others.
	link_re = re.compile(r'<a href="([^"]+)"')

	in_url_section = False
	page = urlopen(url)
	try:
		for line in page:
			# Python 3 yields bytes; Python 2 already yields str.
			if not isinstance(line, str):
				line = line.decode('utf-8', 'replace')

			# have we reached the site URLs yet?
			if start_marker in line:
				in_url_section = True
			# did we leave the URL section?
			elif end_marker in line:
				in_url_section = False

			if not in_url_section:
				continue

			for href in link_re.findall(line):
				print(href)
	finally:
		# Release the connection even if parsing fails part-way.
		page.close()

# Entry point: scrape SITE_URL and print its links only when run as a script.
if __name__ == '__main__':
	geturls(SITE_URL)


Python Scripts for Media sites on Drupal

#!/usr/bin/python

import urllib
import re

# Page that geturls() is pointed at when this script is run directly.
SITE_URL = 'http://groups.drupal.org/node/5100'

def geturls(url):
	"""Print every nofollow link found in the listing section of *url*.

	The page is read line by line.  Scanning starts on the line containing
	the "Media sites using Drupal" <h1> title (matched with its two leading
	spaces) and stops at the line containing '<div class="links">'; that
	closing line is not scanned.  Only anchors written as
	<a href="..." rel="nofollow"> are extracted, and each href is printed on
	its own line.

	Args:
		url: address of the page to scrape; anything urlopen() accepts.

	Returns:
		None.  The URLs are written to standard output.
	"""
	# Local import so the function runs on both Python 2 and Python 3.
	try:
		from urllib.request import urlopen
	except ImportError:
		from urllib import urlopen

	start_marker = '  <h1 class="title node-type-wikipage">Media sites using Drupal</h1>'
	end_marker = '<div class="links">'
	# findall() instead of match(r'.*<a ...'): the greedy prefix only ever
	# captured the last link on a line and silently dropped the others.
	link_re = re.compile(r'<a href="([^"]*)" rel="nofollow">')

	in_url_section = False
	page = urlopen(url)
	try:
		for line in page:
			# Python 3 yields bytes; Python 2 already yields str.
			if not isinstance(line, str):
				line = line.decode('utf-8', 'replace')

			# have we reached the site URLs yet?
			if start_marker in line:
				in_url_section = True
			# did we leave the URL section?
			elif end_marker in line:
				in_url_section = False

			if not in_url_section:
				continue

			for href in link_re.findall(line):
				print(href)
	finally:
		# Release the connection even if parsing fails part-way.
		page.close()

# Entry point: scrape SITE_URL and print its links only when run as a script.
if __name__ == '__main__':
	geturls(SITE_URL)


StorySpace Python Scripts

#!/usr/bin/python

import urllib
import re

# Page that geturls() is pointed at when this script is run directly.
SITE_URL = 'http://www.eastgate.com/storyspace/madewith/madewith.html'

def geturls(url):
	"""Print every new-window link found in the listing section of *url*.

	The page is read line by line.  Scanning starts on the line containing
	'Sites designed, developed, or maintained using Storyspace.' and stops at
	the line containing 'education<br>'; that closing line is not scanned.
	Only anchors written as <a href="..." target="new" (any number of spaces
	after "<a") are extracted, and each href is printed on its own line.

	Args:
		url: address of the page to scrape; anything urlopen() accepts.

	Returns:
		None.  The URLs are written to standard output.
	"""
	# Local import so the function runs on both Python 2 and Python 3.
	try:
		from urllib.request import urlopen
	except ImportError:
		from urllib import urlopen

	start_marker = 'Sites designed, developed, or maintained using Storyspace.'
	end_marker = 'education<br>'
	# findall() instead of match(r'.*<a ...'): the greedy prefix only ever
	# captured the last link on a line and silently dropped the others.
	link_re = re.compile(r'<a *href="([^"]+)" target="new"')

	in_url_section = False
	page = urlopen(url)
	try:
		for line in page:
			# Python 3 yields bytes; Python 2 already yields str.
			if not isinstance(line, str):
				line = line.decode('utf-8', 'replace')

			# have we reached the site URLs yet?
			if start_marker in line:
				in_url_section = True
			# did we leave the URL section?
			elif end_marker in line:
				in_url_section = False

			if not in_url_section:
				continue

			for href in link_re.findall(line):
				print(href)
	finally:
		# Release the connection even if parsing fails part-way.
		page.close()

# Entry point: scrape SITE_URL and print its links only when run as a script.
if __name__ == '__main__':
	geturls(SITE_URL)

Kohana Python Scripts


#!/usr/bin/python

import urllib
import re

# Page that geturls() is pointed at when this script is run directly.
SITE_URL = 'http://forum.kohanaphp.com/comments.php?DiscussionID=346'

def geturls(url):
	"""Print every link found in the relevant part of the page at *url*.

	The page is read line by line.  Scanning starts on the line containing
	the 'This was a post in the old forum ...' sentence and stops at the line
	containing the 'One suggestion would be to increase the jpeg quality ...'
	sentence; that closing line is not scanned.  Only anchors written as
	<a href="..." > (with a space before ">") are extracted, and each href is
	printed on its own line.

	Args:
		url: address of the page to scrape; anything urlopen() accepts.

	Returns:
		None.  The URLs are written to standard output.
	"""
	# Local import so the function runs on both Python 2 and Python 3.
	try:
		from urllib.request import urlopen
	except ImportError:
		from urllib import urlopen

	start_marker = 'This was a post in the old forum so, to see the progression of Kohana apps I put it back here'
	end_marker = 'One suggestion would be to increase the jpeg quality of the image in your home page to look more'
	# findall() instead of match(r'.*<a ...'): the greedy prefix only ever
	# captured the last link on a line and silently dropped the others.
	link_re = re.compile(r'<a href="([^"]*)" >')

	in_url_section = False
	page = urlopen(url)
	try:
		for line in page:
			# Python 3 yields bytes; Python 2 already yields str.
			if not isinstance(line, str):
				line = line.decode('utf-8', 'replace')

			# have we reached the site URLs yet?
			if start_marker in line:
				in_url_section = True
			# did we leave the URL section?
			elif end_marker in line:
				in_url_section = False

			if not in_url_section:
				continue

			for href in link_re.findall(line):
				print(href)
	finally:
		# Release the connection even if parsing fails part-way.
		page.close()

# Entry point: scrape SITE_URL and print its links only when run as a script.
if __name__ == '__main__':
	geturls(SITE_URL)

Zend Python Scripts


#!/usr/bin/python

import urllib
import re

# Page that geturls() is pointed at when this script is run directly.
SITE_URL = 'http://osdir.com/ml/php.zend.framework.mvc/2008-01/msg00325.html'

def geturls(url):
	"""Print every link found in the relevant part of the page at *url*.

	The page is read line by line.  Scanning starts on the line containing
	the 'two months of work we here at the Indianapolis Motor Speedway ...'
	sentence and stops at the line containing 'I also just want to say thanks
	to everyone on the MVC list'; that closing line is not scanned.  Only
	anchors written as <a rel="nofollow" href="..." target="_blank"> are
	extracted, and each href is printed on its own line.

	Args:
		url: address of the page to scrape; anything urlopen() accepts.

	Returns:
		None.  The URLs are written to standard output.
	"""
	# Local import so the function runs on both Python 2 and Python 3.
	try:
		from urllib.request import urlopen
	except ImportError:
		from urllib import urlopen

	start_marker = 'two months of work we here at the Indianapolis Motor Speedway have launched event'
	end_marker = 'I also just want to say thanks to everyone on the MVC list'
	# findall() instead of match(r'.*<a ...'): the greedy prefix only ever
	# captured the last link on a line and silently dropped the others.
	link_re = re.compile(r'<a rel="nofollow" href="([^"]*)" target="_blank">')

	in_url_section = False
	page = urlopen(url)
	try:
		for line in page:
			# Python 3 yields bytes; Python 2 already yields str.
			if not isinstance(line, str):
				line = line.decode('utf-8', 'replace')

			# have we reached the site URLs yet?
			if start_marker in line:
				in_url_section = True
			# did we leave the URL section?
			elif end_marker in line:
				in_url_section = False

			if not in_url_section:
				continue

			for href in link_re.findall(line):
				print(href)
	finally:
		# Release the connection even if parsing fails part-way.
		page.close()

# Entry point: scrape SITE_URL and print its links only when run as a script.
if __name__ == '__main__':
	geturls(SITE_URL)

Monorails Python Scripts

#!/usr/bin/python

import urllib
import re

# Page that geturls() is pointed at when this script is run directly.
SITE_URL = 'http://www.monorails.org/tMspages/Links.html'

def geturls(url):
	"""Print every link found inside <dir> ... </dir> sections of *url*.

	The page is read line by line.  Scanning starts on a line containing
	'<dir>' and stops at a line containing '</dir>'; the closing line is not
	scanned.  Only anchors written as <a href="..."> (href as the sole
	attribute) are extracted, and each href is printed on its own line.

	Args:
		url: address of the page to scrape; anything urlopen() accepts.

	Returns:
		None.  The URLs are written to standard output.
	"""
	# Local import so the function runs on both Python 2 and Python 3.
	try:
		from urllib.request import urlopen
	except ImportError:
		from urllib import urlopen

	start_marker = '<dir>'
	end_marker = '</dir>'
	# findall() instead of match(r'.*<a ...'): the greedy prefix only ever
	# captured the last link on a line and silently dropped the others.
	link_re = re.compile(r'<a href="([^"]*)">')

	in_url_section = False
	page = urlopen(url)
	try:
		for line in page:
			# Python 3 yields bytes; Python 2 already yields str.
			if not isinstance(line, str):
				line = line.decode('utf-8', 'replace')

			# have we reached the site URLs yet?
			if start_marker in line:
				in_url_section = True
			# did we leave the URL section?
			elif end_marker in line:
				in_url_section = False

			if not in_url_section:
				continue

			for href in link_re.findall(line):
				print(href)
	finally:
		# Release the connection even if parsing fails part-way.
		page.close()

# Entry point: scrape SITE_URL and print its links only when run as a script.
if __name__ == '__main__':
	geturls(SITE_URL)
Personal tools
Applied Stacks
Resource Notebook