######## Script to make list of all html files or pseudo files ######
# #
# Need a start file that will link to other files #
# Store the filenames in an ns_set #
# Increment through the ns_set, read files and add new files #
# to the ns_set #
# This file will be fed from submit-altavista.html #
# #
# ****** NEW NEW NEW NEW NEW NEW ******* #
# #
# 1. This version restricts the number of urls searched to #
# 100. #
# 2. The minimum interval for retrieval is 5 seconds #
# 3. Offer of disconnecting and returning email of results #
#####################################################################
proc exclude_urls_100 {exclude_list url_list } {
    # Filter url_list, dropping every url that matches at least one of the
    # glob-style patterns in exclude_list. Returns the surviving urls in
    # their original order (an empty string when nothing survives).
    #
    # Example:
    #   exclude_list: http://zmbh.com/mybasket/* http://zmbh.com/orders*
    #   dropped:      http://zmbh.com/mybasket/add.html
    #   kept:         http://zmbh.com/mybasket
    set kept ""
    foreach candidate $url_list {
        # Becomes 1 as soon as any pattern matches this url.
        set excluded 0
        foreach glob_pattern $exclude_list {
            if {[string match $glob_pattern $candidate]} {
                set excluded 1
                break
            }
        }
        if {$excluded} {
            continue
        }
        lappend kept $candidate
    }
    return $kept
}
proc make_absolute_urls_100 {start_page url_list email} {
    # Turn the links found in start_page into absolute http:// urls.
    #
    # start_page - url of the page the links came from. The host is taken
    #              from the third "/"-separated field, so the value is
    #              assumed to look like http://host/path.
    # url_list   - the links as they appeared in the page.
    # email      - when empty, a progress line is sent with ns_write;
    #              otherwise nothing is written.
    #
    # Returns the list of absolute urls. Links containing a pound sign and
    # mailto:, ftp: and news: links are dropped from the result.
    set show_progress ""
    set new_list ""
    set list_length [llength $url_list]

    # Split the start page into its host and the directory part that
    # relative links are resolved against.
    set parts_of_url [split $start_page "/"]
    set domain_name http://[lindex $parts_of_url 2]
    set current_url [string trim [join [lrange $parts_of_url 3 end] "/"] "/ "]
    set last_slash [string last "/" $current_url]
    set last_part_of_url [string range $current_url [expr {$last_slash + 1}] end]
    if {[string last "." $last_part_of_url] > 0} {
        # The last component contains a dot, so it is treated as a file
        # name and stripped to leave the directory.
        set current_path [string range $current_url 0 [expr {$last_slash - 1}]]
    } else {
        set current_path $current_url
    }
    # debug: ns_write "Last part of url is: $last_part_of_url"

    foreach url $url_list {
        # remove any links with pound signs
        if {[string match "*\#*" $url]} { continue }

        if {![string match "http://*" [string tolower $url]]} {
            # debug: ns_write "'$url' is relative"
            # The link is relative and has to be converted to absolute.
            # "--" keeps a link that starts with "-" from being parsed as
            # an option of switch.
            switch -glob -- $url {
                "mailto:*" - "ftp:*" - "news:*" {
                    # Not a page that can be fetched: count it and drop it.
                    append show_progress "."
                    continue
                }
                ".*" {
                    # Relative to the current directory with . or ..
                    # segments that need normalizing.
                    # debug: ns_write "$url made absolute is $domain_name[ns_normalizepath $current_path/$url]"
                    lappend new_list $domain_name[ns_normalizepath $current_path/$url]
                    append show_progress "."
                    continue
                }
                "/*" {
                    # Relative to the root directory of the site.
                    # debug: ns_write "$url made absolute is $domain_name[ns_normalizepath $url]"
                    lappend new_list $domain_name[ns_normalizepath $url]
                    append show_progress "."
                    continue
                }
                default {
                    # Did not start with / or . so it must be a file or
                    # directory name relative to the current directory.
                    if {$current_path == ""} {
                        # debug: ns_write "$url made absolute is $domain_name/$url"
                        lappend new_list $domain_name/$url
                    } else {
                        # debug: ns_write "$url made absolute is $domain_name/$current_path/$url"
                        lappend new_list $domain_name/$current_path/$url
                    }
                    append show_progress "."
                    continue
                }
            }
        } else {
            # Already absolute: keep it, only forcing the scheme to lower case.
            regsub -nocase "http://" $url "http://" url
            lappend new_list $url
        }
    }

    if {$email == ""} {
        ns_write "
$show_progress (Processed: $list_length links in $start_page.)
"
    }
    return $new_list
}

# ---- Page script: read the submitted form values ----
set email [ns_queryget email]
set base_url [ns_queryget base_url ""]
set start_page $base_url
set limit_dir [ns_queryget limit_dir]
if {$limit_dir == ""} {
    # No limit directory was submitted: default to the directory that
    # contains the start page.
    set parts_of_url [split $start_page "/"]
    set domain_name http://[lindex $parts_of_url 2]
    set current_url "/[string trimright [join [lrange $parts_of_url 3 end] "/"] "/ "]"
    set last_slash [string last "/" $current_url]
    set last_part_of_url [string range $current_url [expr {$last_slash + 1}] end]
    if {[string last "." $last_part_of_url] > 0} {
        set current_path [string range $current_url 0 [expr {$last_slash - 1}]]
        set filename $last_part_of_url
    } else {
        set current_path $current_url
        set filename ""
    }
    set limit_dir $domain_name$current_path
}
# The exclude patterns arrive as one "!"-separated string.
set exclude_list [split [ns_queryget exclude_text] "!"]
set exclude_list_html "I will search this url for other links.
Only Links below: $limit_dir will be added to my list.
I will not follow links that match the following glob patterns:
$exclude_list_htmlWhen I finish, I will email the result to: $email