| 6 | | * PDFminer is a tool to convert PDF docs into text; it is open source [http://www.unixuser.org/~euske/python/pdfminer/index.html#license (Licence)]. Some hacking in the source code would be a good option for coding the importing tool [http://trac.sahanapy.org/wiki/SpreadsheetImporter Spreadsheet Importer] |
| | 6 | * PDFminer is a tool to convert PDF docs into text; it is open source [http://www.unixuser.org/~euske/python/pdfminer/index.html#license (Licence)]. Some hacking in the source code would be a good option for coding the importing tool [http://trac.sahanapy.org/wiki/SpreadsheetImporter Spreadsheet Importer] by codestasher |
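| | 7 | * Code snippet to extract hyperlinks from HTML docs (Python 2 only: it relies on `sgmllib`, `urllib.urlopen` and the `print` statement, none of which exist in Python 3). |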
| | 8 | {{{ |
| | 9 | import sgmllib |
| | 10 | |
| | 11 | class MyParser(sgmllib.SGMLParser): |
| | 12 | |
| | 13 | def parse(self, s): |
| | 14 | self.feed(s) |
| | 15 | self.close() |
| | 16 | |
| | 17 | def __init__(self, verbose=0): |
| | 18 | sgmllib.SGMLParser.__init__(self, verbose) |
| | 19 | self.hyperlinks = [] |
| | 20 | |
| | 21 | def start_a(self, attributes): |
| | 22 | for name, value in attributes: |
| | 23 | if name == "href": |
| | 24 | self.hyperlinks.append(value) |
| | 25 | |
| | 26 | def get_hyperlinks(self): |
| | 27 | return self.hyperlinks |
| | 28 | |
| | 29 | import urllib, sgmllib |
| | 30 | |
| | 31 | f = urllib.urlopen("http://www.python.org") |
| | 32 | s = f.read() |
| | 33 | |
| | 34 | |
| | 35 | myparser = MyParser() |
| | 36 | myparser.parse(s) |
| | 37 | |
| | 38 | |
| | 39 | print myparser.get_hyperlinks() |
| | 40 | |
| | 41 | }}} |
| | 42 | by codestasher |
| | 43 | |