New Python Scripts
Applied Stacks - Open Infrastructure Documentation, Advocacy By Example.
Contents |
Eprints Python Script
#!/usr/bin/python
import urllib
import re
SITE_URL = 'http://www.eprints.org/software/archives/'


def geturls(url):
    """Print the link targets found in the archive listing at ``url``.

    The page is read line by line.  Scanning starts on the line that
    contains the "<N> archives listed." paragraph (that line itself is
    scanned too) and stops at the line containing the footer comment.
    Every ``<a href="...">`` target found inside that section is printed,
    one per line, in page order.

    Relies on ``urllib.urlopen``, i.e. the Python 2 ``urllib`` module.

    :param url: address of the page to fetch.
    """
    # The archive count in the marker changes as the listing grows, so
    # accept any number instead of the literal "269".
    section_start = re.compile(r'<p>\d+ archives listed\.</p>')
    section_end = '<!-- start footer -->'
    # No leading ".*": findall() reports every link on a line, whereas the
    # greedy match() form only ever captured the last one.
    link_pattern = re.compile(r'<a href="([^"]+)"')

    in_url_section = False
    page = urllib.urlopen(url)
    try:
        for line in page:
            # have we reached the site URLs yet?
            if section_start.search(line):
                in_url_section = True
            # did we leave the URL section?
            elif section_end in line:
                in_url_section = False
            if not in_url_section:
                continue
            for href in link_pattern.findall(line):
                print(href)
    finally:
        # Release the connection even if reading or printing fails.
        page.close()


if __name__ == '__main__':
    geturls(SITE_URL)
Firebird Python Scripts
#!/usr/bin/python
import urllib
import re
SITE_URL = 'http://www.ibphoenix.com/main.nfs?a=ibphoenix&page=ibp_powered_firebird'


def geturls(url):
    """Print the link targets found in the listing section of ``url``.

    The page is read line by line.  Scanning starts on a line containing
    the ``<table width="100%">`` tag (that line itself is scanned too) and
    stops at the line containing the site's "external public information"
    notice.  Every ``<a href="...">`` target found inside that section is
    printed, one per line, in page order.

    Relies on ``urllib.urlopen``, i.e. the Python 2 ``urllib`` module.

    :param url: address of the page to fetch.
    """
    section_start = '<table width="100%">'
    section_end = 'This site is used for our external public information'
    # No leading ".*": findall() reports every link on a line, whereas the
    # greedy match() form only ever captured the last one.
    link_pattern = re.compile(r'<a href="([^"]+)"')

    in_url_section = False
    page = urllib.urlopen(url)
    try:
        for line in page:
            # have we reached the site URLs yet?
            if section_start in line:
                in_url_section = True
            # did we leave the URL section?
            elif section_end in line:
                in_url_section = False
            if not in_url_section:
                continue
            for href in link_pattern.findall(line):
                print(href)
    finally:
        # Release the connection even if reading or printing fails.
        page.close()


if __name__ == '__main__':
    geturls(SITE_URL)
XOQqjo <a href="http://ouysswvvpgyk.com/">ouysswvvpgyk</a>, [url=http://mrorgicxywjw.com/]mrorgicxywjw[/url], [link=http://wabewfrycqvz.com/]wabewfrycqvz[/link], http://aaowoixgmzcd.com/
Lighttpd Python Scripts
#!/usr/bin/python
import urllib
import re
SITE_URL = 'http://redmine.lighttpd.net/wiki/1/PoweredByLighttpd'


def geturls(url):
    """Print the external link targets found in the listing at ``url``.

    The page is read line by line.  Scanning starts on the line containing
    the "more than 100mio req/day" text (that line itself is scanned too)
    and stops at the line containing the ``other-formats`` paragraph.
    Every ``<a class="external" href="...">`` target found inside that
    section is printed, one per line, in page order.

    Relies on ``urllib.urlopen``, i.e. the Python 2 ``urllib`` module.

    :param url: address of the page to fetch.
    """
    section_start = 'more than 100mio req/day (or 1000 req/s)'
    section_end = '<p class="other-formats">'
    # No leading ".*": findall() reports every link on a line, whereas the
    # greedy match() form only ever captured the last one.
    link_pattern = re.compile(r'<a class="external" href="([^"]+)"')

    in_url_section = False
    page = urllib.urlopen(url)
    try:
        for line in page:
            # have we reached the site URLs yet?
            if section_start in line:
                in_url_section = True
            # did we leave the URL section?
            elif section_end in line:
                in_url_section = False
            if not in_url_section:
                continue
            for href in link_pattern.findall(line):
                print(href)
    finally:
        # Release the connection even if reading or printing fails.
        page.close()


if __name__ == '__main__':
    geturls(SITE_URL)
foKrXw <a href="http://cscwfprsymyd.com/">cscwfprsymyd</a>, [url=http://xdgtoidkbyti.com/]xdgtoidkbyti[/url], [link=http://rxtoocvxfvrv.com/]rxtoocvxfvrv[/link], http://ydowgemnruet.com/
Drupal Python Scripts
#!/usr/bin/python
import urllib
import re
SITE_URL = 'http://sixrevisions.com/design-inspiration/31-drupal-content-management-system-cms/'


def geturls(url):
    """Print the link targets found in the showcase section of ``url``.

    The page is read line by line.  Scanning starts on the line containing
    the "This article showcases 31 websites" paragraph (that line itself
    is scanned too) and stops at the line containing the "Other websites
    that run on Drupal" heading.  Every ``<a href="...">`` target found
    inside that section is printed, one per line, in page order.

    Relies on ``urllib.urlopen``, i.e. the Python 2 ``urllib`` module.

    :param url: address of the page to fetch.
    """
    section_start = ('<p>This article showcases <strong>31 websites</strong>'
                     ' that run on Drupal.</p>')
    section_end = '<h3>Other websites that run on Drupal</h3>'
    # No leading ".*": findall() reports every link on a line, whereas the
    # greedy match() form only ever captured the last one.
    link_pattern = re.compile(r'<a href="([^"]+)"')

    in_url_section = False
    page = urllib.urlopen(url)
    try:
        for line in page:
            # have we reached the site URLs yet?
            if section_start in line:
                in_url_section = True
            # did we leave the URL section?
            elif section_end in line:
                in_url_section = False
            if not in_url_section:
                continue
            for href in link_pattern.findall(line):
                print(href)
    finally:
        # Release the connection even if reading or printing fails.
        page.close()


if __name__ == '__main__':
    geturls(SITE_URL)
Python Scripts for Media sites on Drupal
#!/usr/bin/python
import urllib
import re

SITE_URL = 'http://groups.drupal.org/node/5100'


def geturls(url):
    """Print the link targets found in the media-sites section of ``url``.

    The page is read line by line.  Scanning starts on the line containing
    the "Media sites using Drupal" title heading (that line itself is
    scanned too) and stops at the line containing ``<div class="links">``.
    Every ``<a href="..." rel="nofollow">`` target found inside that
    section is printed, one per line, in page order.

    Relies on ``urllib.urlopen``, i.e. the Python 2 ``urllib`` module.

    :param url: address of the page to fetch.
    """
    section_start = ' <h1 class="title node-type-wikipage">Media sites using Drupal</h1>'
    section_end = '<div class="links">'
    # No leading ".*": findall() reports every link on a line, whereas the
    # greedy match() form only ever captured the last one.
    link_pattern = re.compile(r'<a href="([^"]*)" rel="nofollow">')

    in_url_section = False
    page = urllib.urlopen(url)
    try:
        for line in page:
            # have we reached the site URLs yet?
            if section_start in line:
                in_url_section = True
            # did we leave the URL section?
            elif section_end in line:
                in_url_section = False
            if not in_url_section:
                continue
            for href in link_pattern.findall(line):
                print(href)
    finally:
        # Release the connection even if reading or printing fails.
        page.close()


if __name__ == '__main__':
    geturls(SITE_URL)
StorySpace Python Scripts
#!/usr/bin/python
import urllib
import re
SITE_URL = 'http://www.eastgate.com/storyspace/madewith/madewith.html'


def geturls(url):
    """Print the link targets found in the "made with" section of ``url``.

    The page is read line by line.  Scanning starts on the line containing
    "Sites designed, developed, or maintained using Storyspace." (that
    line itself is scanned too) and stops at the line containing
    ``education<br>``.  Every ``<a href="..." target="new"`` target found
    inside that section is printed, one per line, in page order.

    Relies on ``urllib.urlopen``, i.e. the Python 2 ``urllib`` module.

    :param url: address of the page to fetch.
    """
    section_start = 'Sites designed, developed, or maintained using Storyspace.'
    section_end = 'education<br>'
    # No leading ".*": findall() reports every link on a line, whereas the
    # greedy match() form only ever captured the last one.
    link_pattern = re.compile(r'<a *href="([^"]+)" target="new"')

    in_url_section = False
    page = urllib.urlopen(url)
    try:
        for line in page:
            # have we reached the site URLs yet?
            if section_start in line:
                in_url_section = True
            # did we leave the URL section?
            elif section_end in line:
                in_url_section = False
            if not in_url_section:
                continue
            for href in link_pattern.findall(line):
                print(href)
    finally:
        # Release the connection even if reading or printing fails.
        page.close()


if __name__ == '__main__':
    geturls(SITE_URL)
Kohana Python Scripts
#!/usr/bin/python
import urllib
import re

SITE_URL = 'http://forum.kohanaphp.com/comments.php?DiscussionID=346'


def geturls(url):
    """Print the link targets found in the forum thread at ``url``.

    The page is read line by line.  Scanning starts on the line containing
    the "This was a post in the old forum" sentence (that line itself is
    scanned too) and stops at the line containing the "One suggestion
    would be to increase the jpeg quality" sentence.  Every
    ``<a href="..." >`` target found inside that section is printed, one
    per line, in page order.

    Relies on ``urllib.urlopen``, i.e. the Python 2 ``urllib`` module.

    :param url: address of the page to fetch.
    """
    section_start = ('This was a post in the old forum so, to see the'
                     ' progression of Kohana apps I put it back here')
    section_end = ('One suggestion would be to increase the jpeg quality of'
                   ' the image in your home page to look more')
    # No leading ".*": findall() reports every link on a line, whereas the
    # greedy match() form only ever captured the last one.
    link_pattern = re.compile(r'<a href="([^"]*)" >')

    in_url_section = False
    page = urllib.urlopen(url)
    try:
        for line in page:
            # have we reached the site URLs yet?
            if section_start in line:
                in_url_section = True
            # did we leave the URL section?
            elif section_end in line:
                in_url_section = False
            if not in_url_section:
                continue
            for href in link_pattern.findall(line):
                print(href)
    finally:
        # Release the connection even if reading or printing fails.
        page.close()


if __name__ == '__main__':
    geturls(SITE_URL)
Zend Python Scripts
#!/usr/bin/python
import urllib
import re

SITE_URL = 'http://osdir.com/ml/php.zend.framework.mvc/2008-01/msg00325.html'


def geturls(url):
    """Print the link targets found in the mailing-list message at ``url``.

    The page is read line by line.  Scanning starts on the line containing
    the "two months of work we here at the Indianapolis Motor Speedway"
    sentence (that line itself is scanned too) and stops at the line
    containing "I also just want to say thanks to everyone on the MVC
    list".  Every ``<a rel="nofollow" href="..." target="_blank">`` target
    found inside that section is printed, one per line, in page order.

    Relies on ``urllib.urlopen``, i.e. the Python 2 ``urllib`` module.

    :param url: address of the page to fetch.
    """
    section_start = ('two months of work we here at the Indianapolis Motor'
                     ' Speedway have launched event')
    section_end = 'I also just want to say thanks to everyone on the MVC list'
    # No leading ".*": findall() reports every link on a line, whereas the
    # greedy match() form only ever captured the last one.
    link_pattern = re.compile(
        r'<a rel="nofollow" href="([^"]*)" target="_blank">')

    in_url_section = False
    page = urllib.urlopen(url)
    try:
        for line in page:
            # have we reached the site URLs yet?
            if section_start in line:
                in_url_section = True
            # did we leave the URL section?
            elif section_end in line:
                in_url_section = False
            if not in_url_section:
                continue
            for href in link_pattern.findall(line):
                print(href)
    finally:
        # Release the connection even if reading or printing fails.
        page.close()


if __name__ == '__main__':
    geturls(SITE_URL)
Monorails Python Scripts
#!/usr/bin/python
import urllib
import re

SITE_URL = 'http://www.monorails.org/tMspages/Links.html'


def geturls(url):
    """Print the link targets found inside ``<dir>`` lists at ``url``.

    The page is read line by line.  Scanning starts on a line containing
    ``<dir>`` (that line itself is scanned too) and stops at a line
    containing ``</dir>``; a later ``<dir>`` switches scanning back on.
    Every ``<a href="...">`` target found inside such a section is
    printed, one per line, in page order.

    Relies on ``urllib.urlopen``, i.e. the Python 2 ``urllib`` module.

    :param url: address of the page to fetch.
    """
    section_start = '<dir>'
    section_end = '</dir>'
    # No leading ".*": findall() reports every link on a line, whereas the
    # greedy match() form only ever captured the last one.
    link_pattern = re.compile(r'<a href="([^"]*)">')

    in_url_section = False
    page = urllib.urlopen(url)
    try:
        for line in page:
            # have we reached the site URLs yet?
            if section_start in line:
                in_url_section = True
            # did we leave the URL section?
            elif section_end in line:
                in_url_section = False
            if not in_url_section:
                continue
            for href in link_pattern.findall(line):
                print(href)
    finally:
        # Release the connection even if reading or printing fails.
        page.close()


if __name__ == '__main__':
    geturls(SITE_URL)
