######## Script to make list of all html files or pseudo files ###### # # # Need a start file that will link to other files # # Store the filenames in an ns_set # # Increment through the ns_set, read files and add new files # # to the ns_set # # This file will be fed from submit-altavista.html # # # # ****** NEW NEW NEW NEW NEW NEW ******* # # # # 1. This version restricts the number of urls searched to # # 100. # # 2. The minimum interval for retrieval is 5 seconds # # 3. Offer of disconnecting and returning email of results # ##################################################################### proc ReturnHead { {title "DHTML Test"} args } { set headtags "" foreach arg $args {append headtags $arg } ns_write "
© Datapro Security Corp. 1999-2002 Webmaster
" }
proc exclude_urls_100 {exclude_list url_list } {
    # Filter url_list, dropping every URL that matches at least one of the
    # glob-style patterns (as interpreted by [string match]) in exclude_list.
    #
    # Example, with exclude_list holding the two patterns
    #     http://zmbh.com/mybasket/*   http://zmbh.com/orders*
    #   dropped: http://zmbh.com/mybasket/add.html
    #   kept:    http://zmbh.com/mybasket
    #
    # Arguments:
    #   exclude_list - list of glob patterns
    #   url_list     - list of URLs to filter
    # Returns:
    #   the URLs that matched no pattern, in their original order
    #   (an empty string when nothing survives).
    set kept {}
    foreach candidate $url_list {
        set excluded 0
        foreach glob $exclude_list {
            if {[string match $glob $candidate]} {
                # One hit is enough; no need to test the remaining patterns.
                set excluded 1
                break
            }
        }
        if {$excluded} {
            continue
        }
        lappend kept $candidate
    }
    return $kept
}
proc make_absolute_urls_100 {start_page url_list email} { set show_progress "" set new_list "" set list_length [llength $url_list] set parts_of_url [split $start_page "/"] set domain_name http://[lindex $parts_of_url 2] set current_url "[string trim [join [lrange $parts_of_url 3 end] "/"] "/ "]" set last_slash [string last "/" $current_url] set last_part_of_url [string range $current_url [expr $last_slash + 1] end] if { [string last "." $last_part_of_url] > 0} { set current_path [string range $current_url 0 [expr $last_slash - 1]] } else { set current_path $current_url } # ns_write "Last part of url is: $last_part_of_url$show_progress (Processed: $list_length links in $start_page.)
" } return $new_list } set email [ns_queryget email] set base_url [ns_queryget base_url ""] set start_page $base_url set limit_dir [ns_queryget limit_dir] if {$limit_dir == ""} { set parts_of_url [split $start_page "/"] set domain_name http://[lindex $parts_of_url 2] set current_url "/[string trimright [join [lrange $parts_of_url 3 end] "/"] "/ "]" set last_slash [string last "/" $current_url] set last_part_of_url [string range $current_url [expr $last_slash + 1] end] if { [string last "." $last_part_of_url] > 0} { set current_path [string range $current_url 0 [expr $last_slash - 1]] set filename $last_part_of_url } else { set current_path $current_url set filename "" } set limit_dir $domain_name$current_path } set exclude_list [split [ns_queryget exclude_text] "!"] set exclude_list_html "I will search this url for other links.
Only Links below: $limit_dir will be added to my list.
I will not follow links that match the following glob patterns:
$exclude_list_htmlWhen I finish, I will email the result to: $email