#
# html.pl --- extract, normalize and hypertextify URLs in HTML files
#
# Copyright (c) 1995 Oscar Nierstrasz
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or (at
# your option) any later version.
#
# This program is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
# General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program (as the file COPYING in the main directory of
# the distribution); if not, write to the Free Software Foundation,
# Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
# NB: If this package interests you, you should probably
# have a look at Roy Fielding's libwww-perl packages:
# http://www.ics.uci.edu/WebSoft/libwww-perl/
#
# This package and friends can be found at:
# http://iamwww.unibe.ch/~scg/Src/
#
# This package contains:
#
# html'href: identify URLs and turn them into hypertext links
# html'abs: convert relative URLs to absolute ones
# html'parse: parse an URL and return ($type,$host,$port,$path,$request)
# html'hrefs: return all hrefs in a page
# html'esc: escape the special characters &, < and > in plain text
# BUGS: Craig Allen points out that binary files
# that contain "<A HREF=..." references are NOT recognized by html'abs -- this needs
# to be fixed! ...
# &parse does not currently handle firewall URLs
# &href can choke on "<A HREF=...>a...b</A>" patterns
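#
# Example usage (an illustrative sketch, not a specification; it assumes
# this file is on your Perl library path as "html.pl"):
#
#   require "html.pl";
#
#   $_ = "See http://iamwww.unibe.ch/~scg/Src/ for sources.";
#   &html'href;		# $_ now contains <A HREF=...> anchors
#
#   ($type,$host,$port,$path,$request) = &html'parse($url);
#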
package html;
#$v = 'html.pl v1.0'; # Oscar Nierstrasz 26/8/93
#$v = 'html.pl v1.1'; # 15/01/94
# -- fixed html'abs to handle HREFs without surrounding quotes
#$v = 'html.pl v1.2'; # 09/02/94
# -- fixed html'abs to handle images as well
#$v = 'html.pl v1.3'; # 24/3/94
# -- added hrefs (from `explore')
#$v = 'html.pl v1.4'; # 25/3/94
# -- fixed hrefs to handle malformed HREFs (missing or extra quotes)
#$v = 'html.pl v1.5'; # 25/3/94
# -- fixed abs to leave internal refs alone!
#$v = 'html.pl v1.6'; # 25/3/94
# -- moved to separate package
#$v = 'html.pl v1.7'; # 13/4/94
# -- repaired abs() to handle HREFs with missing quotes
#$v = 'html.pl v1.8'; # 25/5/94
# -- modified parse() to handle empty protocol type
#$v = 'html.pl v1.9'; # 1/2/95
# -- fixed abs to search for closing ">"
#$v = 'html.pl v1.10'; # 3/2/95
# -- fixed href() to use ^A as EOL placeholder (instead of #)
#$v = 'html.pl v1.11'; # 20/2/95 Gorm Haug Eriksen
# -- fixed some bugs in abs
#$v = 'html.pl v1.12'; # 8/7/95
# -- added mailto: treatment (oscar) [may conflict with ftp formats?]
# NEEDS further testing ...
#$v = 'html.pl v1.13'; # 9/7/95
# -- added &makequery
#$v = 'html.pl v1.14'; # 16/8/95
# -- fixed backslash handling in &makequery
#$v = 'html.pl v1.15'; # 27/9/95
# Nicolai Langfeldt (janl@ifi.uio.no)
# 95/09/19 -- Added canonize function that makes a html doc a
# bit more of a SGML document.
#$v = 'html.pl v1.16'; # 16/10/95 -- assume just 2-3 chars in country code, Daniel Quinlan
#$v = 'html.pl v1.17'; # 16/10/95 -- modified more by Daniel Quinlan
# use capitalized tags
# uncommented mailto: treatment and modified to also allow `.' and `-'
$v = 'html.pl v1.18'; # 28/03/02 -- modified more by Daniel Quinlan
# obfuscate email addresses
# Try to recognize URLs and ftp file identifiers and convert them into HREFs:
# This routine is evolving. The patterns are not perfect.
# This is really a parsing problem, and not a job for perl ...
# It is also generally impossible to distinguish ftp site names
# from newsgroup names if the ":<directory>" is missing.
# An arbitrary file name ("runtime.pl") can also be confused.
sub href {
# study; # doesn't speed things up ...
# to avoid special cases for beginning & end of line
s|^|\001|; s|$|\001|;
# URLS: <service>:<rest-of-url>
s|(news:[\w.]+)|<A HREF="$&">$&</A>|g;
s|(http:[\w/.:+\-~#?]+)|<A HREF="$&">$&</A>|g;
s|(file:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
s|(ftp:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
s|(wais:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
s|(gopher:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
s|(telnet:[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
# s|(\w+://[\w/.:+\-]+)|<A HREF="$&">$&</A>|g;
# catch some newsgroups to avoid confusion with sites:
s|([^\w\-/.:@>])(alt\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
s|([^\w\-/.:@>])(bionet\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
s|([^\w\-/.:@>])(bit\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
s|([^\w\-/.:@>])(comp\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
s|([^\w\-/.:@>])(gnu\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
s|([^\w\-/.:@>])(misc\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
s|([^\w\-/.:@>])(news\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
s|([^\w\-/.:@>])(rec\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
s|([^\w\-/.:@>])(ch\.[\w.+\-]+[\w+\-]+)|$1<A HREF="news:$2">$2</A>|g;
# FTP locations (with directory):
# anonymous@<site>:<directory>
s|(anonymous@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,3}):(\s*)([\w\d+\-/.]+)|$1<A HREF="ftp://$2/$4">$2:$4</A>$3|g;
# ftp@<site>:<directory>
s|(ftp@)([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,3}):(\s*)([\w\d+\-/.]+)|$1<A HREF="ftp://$2/$4">$2:$4</A>$3|g;
# <site>:<directory>
s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,3}):(\s*)([\w\d+\-/.]+)|$1<A HREF="ftp://$2/$4">$2:$4</A>$3|g;
# NB: don't confuse an http server with a port number for
# an FTP location!
# internet number version: <site>:<directory>
s|([^\w\-/.:@])(\d{2,}\.\d{2,}\.\d+\.\d+):([\w\d+\-/.]+)|$1<A HREF="ftp://$2/$3">$2:$3</A>|g;
# just the site name (assume two dots, ends in .xx or .xxx):
s|([^\w\-/.:@>])([a-zA-Z][\w+\-]+\.[\w.+\-]+\.[a-zA-Z]{2,3})([^\w\d\-/.:!])|$1<A HREF="ftp://$2">$2</A>$3|g;
# NB: can be confused with newsgroup names!
# <site>.com has only one dot:
s|([^\w\-/.:@>])([a-zA-Z][\w.+\-]+\.com)([^\w\-/.:])|$1<A HREF="ftp://$2">$2</A>$3|g;
# just internet numbers:
s|([^\w\-/.:@])(\d+\.\d+\.\d+\.\d+)([^\w\-/.:])|$1<A HREF="ftp://$2">$2</A>$3|g;
# unfortunately inet numbers can easily be confused with
# european telephone numbers ...
# Mail addresses:
s|([\w.-]+)@([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,3})|address omitted|g;
### s|([\w.-]+)@([a-zA-Z][\w.+\-]+\.[a-zA-Z]{2,3})|<A HREF="mailto:$&">$&</A>|g;
s|^\001||; s|\001$||;
}
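# Example (illustrative only): &href rewrites $_ in place, e.g.
#
#   $_ = "see http://iamwww.unibe.ch/~scg/ for details\n";
#   &html'href;
#   # $_ is now roughly:
#   # see <A HREF="http://iamwww.unibe.ch/~scg/">http://iamwww.unibe.ch/~scg/</A> for details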
# debugging aid: print each element of a list on its own line
sub printa {
@tmpbah = @_;
foreach (@tmpbah) {
print "-> $_\n";
}
}
# convert relative http URLs to absolute ones:
# BUG: minor problem with binary files containing "<A HREF" patterns
# (see the note below)
sub abs {
local($url,$page) = @_;
($type,$host,$port,$path,$request) = &parse($url);
$root = "http://$host:$port";
# handle <A HREF=...> links:
@hrefs = split(/<[Aa]\b/,$page);
for ($n = 1; $n <= $#hrefs; $n++) {
# absolute URLs and internal "#" references are left alone:
($hrefs[$n] =~ m|href\s*=\s*"?\w+:|i) && next;
($hrefs[$n] =~ m|href\s*=\s*"?#|i) && next;
# absolute path from the server root:
# ($hrefs[$n] =~ s|href\s*=\s*"?/([^">]*)"?|HREF="$root/$1"|i) && next;
# this was changed, because it didn't manage urls with "'s in link
# ($hrefs[$n] =~ s|href\s*=\s*"?/([^">]*)(.*)"\s*>(.*)$|HREF="$root/$1"$2>$3|i) && next;
($hrefs[$n] =~ s|href\s*=\s*"?/([^">]*)(.*)"\s*>(.*)$|HREF="$root/$1"$2>$3|i) && next;
# relative from $path:
# $hrefs[$n] =~ s|href\s*=\s*"?([^/"][^">]*)"?|HREF="$root$path$1"|i;
# $hrefs[$n] =~ s|href\s*=\s*"?([^/"][^">]*)"?.*>|HREF="$root$path$1"|i;
$hrefs[$n] =~ s|href\s*=\s*"?([^/"][^">]*)(.*)"\s*>(.*)|HREF="$root$path$1"$2>$3|i;
# collapse relative paths:
$hrefs[$n] =~ s|/\./|/|g;
while ($hrefs[$n] =~ m|/\.\./|) {
$hrefs[$n] =~ s|[^/]*/\.\./||;
}
}
# Actually, this causes problems for binary files
# that just happen to include the sequence "<A HREF" or "<IMG SRC".
# handle <IMG SRC=...> references the same way:
$page = join("<A",@hrefs);
@hrefs = split(/<[Ii][Mm][Gg]\b/,$page);
for ($n = 1; $n <= $#hrefs; $n++) {
# absolute URLs are left alone:
($hrefs[$n] =~ m|src\s*=\s*"?\w+:|i) && next;
# absolute path from the server root:
# ($hrefs[$n] =~ s|src\s*=\s*"?/([^"?>]*)"?|SRC="$root/$1"|i) && next;
# ($hrefs[$n] =~ s|src\s*=\s*"?/([^"?>]*)"?(.*)>|SRC="$root/$1$2>"|i) && next;
($hrefs[$n] =~ s|src\s*=\s*"?/([^"?>]*)"?(.*)>|SRC="$root/$1"$2>|i) && next;
# relative from $path:
# $hrefs[$n] =~ s|src\s*=\s*"?([^/"][^">]*)"?|SRC="$root$path$1"|i;
# $hrefs[$n] =~ s|src\s*=\s*"?([^/"][^">]*)"?.*>|SRC="$root$path$1"|i;
$hrefs[$n] =~ s|src\s*=\s*"?([^/"][^">]*)"?\s+(.*)|SRC="$root$path$1" $2|i;
# collapse relative paths:
$hrefs[$n] =~ s|/\./|/|g;
while ($hrefs[$n] =~ m|/\.\./|) {
$hrefs[$n] =~ s|[^/]*/\.\./||;
}
}
join("<IMG",@hrefs);
}
:"
if ($url =~ /^(\w+):+(.*)/) {
$type = $1;
$request = $2;
}
# relative URL of form "<request>"
else { $request = $url; }
$request =~ s|^$|/|;
$request =~ s|^([^/])|$path$1|; # relative path
$request =~ s|/\./|/|g;
while ($request =~ m|/\.\./|) {
$request =~ s|[^/]*/\.\./||;
}
# assume previous host & port:
unless ($host) {
# $! = "html'parse: no host for $url\n";
print STDERR "html'parse: no host for $url\n";
return (undef,undef,undef,undef,undef);
}
}
($type,$host,$port,$path,$request);
}
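# Example (illustrative):
#
#   ($type,$host,$port,$path,$request) =
#       &html'parse("http://iamwww.unibe.ch/~scg/Src/index.html");
#   # yields ("http", "iamwww.unibe.ch", 80, "/~scg/Src/", "/~scg/Src/index.html")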
# default ports
sub defport {
local($type) = @_;
if ($type eq "http") { 80; }
elsif ($type eq "gopher") { 70; }
else { undef; }
}
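# e.g. &defport("http") is 80; unknown types yield undef.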
# return a list of all the hrefs in a page
sub hrefs {
local($page) = @_;
$page =~ s/^[^<]+//;	# drop text before the first tag
$page =~ s/>[^<]+/>/g;	# drop text between and after tags
$page =~ s/<a[^>]*href\s*=\s*"?([^">]+)[^>]*>/$1\n/gi;
$page =~ s/<img[^>]*src\s*=\s*"?([^">]+)[^>]*>/$1\n/gi;
$page =~ s/<[^>]*>//g;
$page =~ s/#.*//g;
$page =~ s/\n+/\n/g;
split(/\n/,$page);
}
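# Example (illustrative, hypothetical file names):
#
#   $page = '<P>See <A HREF="intro.html#top">the intro</A> and <IMG SRC="logo.gif">.';
#   @urls = &html'hrefs($page);	# ("intro.html", "logo.gif")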
# escape characters in plain text:
sub esc {
s/&/&amp;/g;
s/</&lt;/g;
s/>/&gt;/g;
}
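# Example (illustrative): &esc also works on $_ in place:
#
#   $_ = 'if (x < y && y > z) ...';
#   &html'esc;	# $_ is now 'if (x &lt; y &amp;&amp; y &gt; z) ...'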
# escape special chars in a string so it can be used as an isindex query:
sub makequery {
local($qry) = @_;
$qry =~ s| |+|g; # escape blanks
#$qry =~ s|\\|\\\\|g; # escape backslashes
$qry =~ s|\\|%5C%5C|g; # escape backslashes
#$qry =~ s|([()])|\\$1|g; # escape parens etc.
$qry =~ s|([()])|%5C$1|g; # escape parens etc.
$qry;
}
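# Example (illustrative):
#
#   $query = &html'makequery("find (this) file");
#   # yields "find+%5C(this%5C)+file"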
sub canonize {
# Canonize a nonconformant SGML (HTML) document to a more conformant
# document, the general structure of a conformant document is:
#
# <HTML> <HEAD> ... </HEAD> <BODY> ... </BODY> </HTML>
#
# We'll insert whatever is missing of those. This allows automatic
# type recognition of html files on disk, which is useful,
# janl 95/09/19
local($*)=1;
local($add)='';
$add.='<HTML>'."\n"
unless ($_[$[] =~ m~<HTML~i);
$add.='<HEAD>'."\n"
unless $_[$[] =~ m~<HEAD~i;