2.2 The main subroutines in surfInternet.c:
buildURL()
Combines the protocol name, host name and file name into a URL.
downLoadFindKey()
Connects to the server on the Internet using a URL, and then waits for the reply.
Uses the string concatenation function to accumulate the Web page in an array as data comes back from the server.
Searches the array for the keyword. If the keyword is found, saves the URL to the "urlAddr.doc" file.
extractURL()
Finds a URL in the downloaded Web page. It can deal with different kinds of URL appearance, including white space inside the URL:
http://www.sbu.ac.uk/dynamic/homepage/
http://dynamic/homepage/
//www.sbu.ac.uk/dynamic/homepage/
//www.sbu.ac.uk/
//www.sbu.ac.uk
/dynamic/homepage/
dynamic/homepage/
http://www.sbu.ac.uk/dynamic
/homepage/
#homepage
compareURL()
Compares the URL extracted from the page with the URLs saved in "url.tmp", which holds the URLs that have already been searched. If the URL is not yet in "url.tmp", it is saved there for further use.
3 Program
/* Program name: surfInternet
Date: 28/5/1997
Overview
This program lets the user search the Internet for a key word.
It stores the URL of each matching web page in the file "urlAddr.doc".
The user supplies the WWW domain name, the file name and the key word
when starting the program.
Structure
1. The web page given by the input is downloaded and searched for the
key word. If the key word is found, the URL is saved;
2. A URL is extracted from the downloaded page and compared with the
old URLs to judge whether it is a new URL that has never been searched;
3. If a new URL is found, a new process is forked to do (1+2);
4. If no new URL is found, step 2 is repeated until the program exits.
**Warning
1. Because the Internet is so large and HTTP is the mainstream protocol,
the program searches only HTML files. It is easy to change the program
to search other kinds of resource, such as FTP, Gopher, etc.
2. The maximum number of processes allowed for a UNIX account is 64, so
the number of forked processes should be limited.
3. The maximum buffer size is 90,000. Because the Internet has numerous
URLs, it is easy to exceed that limit.
4. Errors will occur if one of these limits is exceeded.
5. The key word should be given in lower case.
*/
#include <ctype.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/select.h>
#include <sys/socket.h>
#include <sys/stat.h>
#include <sys/time.h>
#include <netdb.h>
#include <netinet/in.h>
#define SERVER_PORT 80 /* The default Webserver port */
#define BUFSIZE 100000 /* read buffer */
#define COMMAND 100 /* Size of GET command */
/*
 * Forward declarations for every function defined in this file, so that no
 * call relies on an implicit declaration (which C99 and later reject).
 */
int openFile(char *filename);
int initialUrlTmp(int fd2, char *URLaddr);
char *buildURL(char *protocol, char *hostName, char *fileName);
char *downLoadFindKey(int fd1, char *URLaddr, char *keyWord);
char *extractURL(char *pbuf, char *URLaddr);
int compareURL(int fd2, char *URLaddr);
char *buildHTTPCommand(char *http, char *path);
int requestConnection(char *serverMachine, int serverPort);
char *writeAndReadAndFind(int s, int fd1, char *message, char *keyWord, char *URLaddr);
int anyThingThere(int s);
int makeDestSA(struct sockaddr_in *sa, char *hostname, int port);
int printSA(struct sockaddr_in sa);
/*
 * Entry point.
 *
 * Usage: <program> server path keyword
 *
 * Builds the start URL "http://<server><path>", records it in "url.tmp",
 * downloads the page (downLoadFindKey() also records the URL in
 * "urlAddr.doc" when the key word occurs on the page) and then walks over
 * every "href"/"HREF" on the page.  Each link that compareURL() reports as
 * new is handed to a forked child process, which repeats the whole
 * procedure for that link.  A process stops forking after ten new links.
 *
 * Returns 0 on normal termination; exits with status 1 on a usage error.
 */
int main(int argc, char **argv) {
    int fd1, fd2, num = 0, findNew;
    pid_t pid;
    char pbuf[BUFSIZE], URLaddr[100], newURLaddr[100];
    char *keyWord, *cursor, *lower, *upper, *link, *tagEnd;

    if (argc != 4) {
        fprintf(stderr, "usage: %s server path keyword\n",
                argc > 0 ? argv[0] : "surfInternet");
        exit(1);
    }
    keyWord = argv[3];              /* used read-only, no copy needed */
    fd1 = openFile("urlAddr.doc");  /* URLs of pages containing the key word */
    fd2 = openFile("url.tmp");      /* URLs that have already been searched */
    snprintf(URLaddr, sizeof URLaddr, "%s", buildURL("http", argv[1], argv[2]));
    initialUrlTmp(fd2, URLaddr);    /* the start URL counts as searched */

begin:
    /* download the page and look for the key word */
    snprintf(pbuf, sizeof pbuf, "%s", downLoadFindKey(fd1, URLaddr, keyWord));
    cursor = pbuf;

    for (;;) {
        /* locate the next link, whichever spelling comes first */
        lower = strstr(cursor, "href");
        upper = strstr(cursor, "HREF");
        if (lower != NULL && upper != NULL) {
            link = (lower < upper) ? lower : upper;
        } else {
            link = (lower != NULL) ? lower : upper;
        }
        if (link == NULL) {
            break;                  /* no further URL on the page */
        }
        cursor = link;

        snprintf(newURLaddr, sizeof newURLaddr, "%s", extractURL(cursor, URLaddr));
        findNew = compareURL(fd2, newURLaddr);
        if (findNew == 1) {         /* a URL that was not searched before */
            num++;
            if (num > 10) {         /* limit the number of processes */
                close(fd1);
                close(fd2);
                exit(0);
            }
            pid = fork();
            if (pid == 0) {         /* child: search the new page */
                snprintf(URLaddr, sizeof URLaddr, "%s", newURLaddr);
                goto begin;
            }
            if (pid < 0) {
                perror("fork");     /* keep scanning this page in the parent */
            }
        }

        /* skip the rest of the current tag before looking for the next link */
        tagEnd = strchr(cursor, '>');
        if (tagEnd == NULL) {
            break;                  /* page ends inside the tag */
        }
        cursor = tagEnd;
    }

    close(fd1);
    close(fd2);
    printf("exit!\n");
    return 0;
}
/*
 * Write URLaddr followed by a newline to the open descriptor fd2.
 * main() uses this to record the start URL in the temp file.
 *
 * The line is assembled in a heap buffer sized from the URL, so a long URL
 * cannot overflow a fixed array, and it is sent with a single write().
 *
 * Returns 0 on success and -1 when memory is exhausted or the write fails
 * or is short ("Error on write." is printed in that case).
 */
int initialUrlTmp(int fd2, char *URLaddr) {
    size_t len = strlen(URLaddr);
    char *line = malloc(len + 2);
    int rc = 0;

    if (line == NULL) {
        printf("Error on write.\n");
        return -1;
    }
    memcpy(line, URLaddr, len);
    line[len] = '\n';
    line[len + 1] = '\0';

    if (write(fd2, line, len + 1) != (ssize_t)(len + 1)) {
        printf("Error on write.\n");
        rc = -1;
    }
    free(line);
    return rc;
}
/*
 * Create the file `filename` (or truncate it when it already exists) and
 * open it for writing, readable and writable by the owner only.
 *
 * A single open() call replaces the former creat()/close()/open() sequence,
 * whose creat() result was never checked.
 *
 * Returns the open descriptor.  On failure the reason is reported and the
 * process exits with status 1.
 */
int openFile(char *filename) {
    int fd = open(filename, O_WRONLY | O_CREAT | O_TRUNC, S_IRUSR | S_IWUSR);

    if (fd == -1) {
        perror(filename);
        printf("Cannot open URL doc file.\n");
        exit(1);
    }
    return fd;
}
/*
 * Append `src` to dst[used .. cap-1], dropping every white-space character,
 * and keep dst NUL-terminated.  Returns the new string length.
 */
static size_t appendNoSpace(char *dst, size_t used, size_t cap, const char *src) {
    if (src != NULL) {
        for (; *src != '\0' && used + 1 < cap; src++) {
            if (!isspace((unsigned char)*src)) {
                dst[used++] = *src;
            }
        }
    }
    dst[used] = '\0';
    return used;
}

/*
 * Build "<protocol>://<hostName><fileName>" with all white space removed.
 *
 * The result lives in a static 100-byte buffer (the same size the callers in
 * this file use for URLs); anything beyond 99 characters is cut off.  The
 * buffer is overwritten by the next call, so copy the result if it must
 * survive.  The former version returned the address of a local array, which
 * is no longer valid once the function has returned.
 */
char *buildURL(char *protocol, char *hostName, char *fileName) {
    static char URLaddrs[100];
    size_t used = 0;

    URLaddrs[0] = '\0';
    used = appendNoSpace(URLaddrs, used, sizeof URLaddrs, protocol);
    used = appendNoSpace(URLaddrs, used, sizeof URLaddrs, "://");
    used = appendNoSpace(URLaddrs, used, sizeof URLaddrs, hostName);
    (void)appendNoSpace(URLaddrs, used, sizeof URLaddrs, fileName);
    return URLaddrs;
}
/*
 * Decide whether URLaddr has been seen before.
 *
 * "url.tmp" is scanned line by line.  If one complete line equals URLaddr
 * the URL is old and 0 is returned.  Otherwise URLaddr plus a newline is
 * appended through the already-open descriptor fd2 and 1 is returned.
 *
 * The file is streamed, so its size is not limited by a buffer, and only a
 * whole-line match counts (a URL that is merely the tail of a stored URL is
 * still new).  Exits with status 1 when "url.tmp" cannot be opened.
 */
int compareURL(int fd2, char *URLaddr) {
    FILE *seen;
    size_t len = strlen(URLaddr);
    size_t pos = 0;         /* characters of the current line matched so far */
    int lineMatches = 1;    /* current line still equals URLaddr[0..pos) */
    int found = 0;
    int c;
    char *line;

    seen = fopen("url.tmp", "r");
    if (seen == NULL) {
        printf("Cannot open URL addr temp file.\n");
        exit(1);
    }
    while ((c = getc(seen)) != EOF) {
        if (c == '\n') {
            if (lineMatches && pos == len) {
                found = 1;
                break;
            }
            pos = 0;
            lineMatches = 1;
        } else if (lineMatches && pos < len && (unsigned char)URLaddr[pos] == c) {
            pos++;
        } else {
            lineMatches = 0;
        }
    }
    fclose(seen);

    if (found) {            /* old URL */
        return 0;
    }

    /* new URL: remember it, sent with one write() call */
    line = malloc(len + 2);
    if (line == NULL) {
        printf("Error on write.\n");
        return 1;
    }
    memcpy(line, URLaddr, len);
    line[len] = '\n';
    line[len + 1] = '\0';
    if (write(fd2, line, len + 1) != (ssize_t)(len + 1)) {
        printf("Error on write.\n");
    }
    free(line);
    return 1;
}
/*
 * Split the absolute URL `url` ("scheme://host/path") into its parts.  Each
 * output buffer has room for `cap` bytes.  A missing path becomes "/".
 * Returns 0 on success, -1 when `url` has no "://", has an empty scheme or
 * host, or a part does not fit.
 */
static int extractSplitBase(const char *url, char *protocol, char *hostName,
                            char *fileName, size_t cap) {
    const char *sep = strstr(url, "://");
    const char *host, *path;
    size_t protoLen, hostLen;

    if (sep == NULL) {
        return -1;
    }
    protoLen = (size_t)(sep - url);
    host = sep + 3;
    hostLen = strcspn(host, "/");
    path = (host[hostLen] == '/') ? host + hostLen : "/";
    if (protoLen == 0 || hostLen == 0 || protoLen >= cap || hostLen >= cap ||
        strlen(path) >= cap) {
        return -1;
    }
    memcpy(protocol, url, protoLen);
    protocol[protoLen] = '\0';
    memcpy(hostName, host, hostLen);
    hostName[hostLen] = '\0';
    strcpy(fileName, path);
    return 0;
}

/*
 * Extract the link that follows the "href" at the start of `pbuf` and turn
 * it into an absolute URL, using `URLaddr` (the URL of the page being
 * scanned) to fill in whatever the link leaves out.
 *
 * Link forms handled (white space inside the quotes is ignored):
 *   http://host/path   //host/path   //host   /path   path   #fragment
 *
 * URLaddr itself is returned, meaning "nothing new here", when
 *   - no double-quoted value is found before the end of the tag,
 *   - the link is 100 characters or longer, or the result would be,
 *   - the scheme (the link's own, else the page's) is not "http",
 *   - the link is empty or only a fragment (it refers to the same page).
 *
 * A "#fragment" suffix is removed.  "." and ".." path segments are not
 * resolved.  Single-quoted and unquoted attribute values are not recognised.
 *
 * The result is held in a static buffer that the next call overwrites.
 */
char *extractURL(char *pbuf, char *URLaddr) {
    static char newURLaddr[100];
    char protocol[100], hostName[100], fileName[100], link[100];
    char *quote, *tagEnd, *rest, *p, *slash;
    size_t n = 0, i, schemeLen, hostLen, dirLen;
    int hasHost = 0;

    /* default answer: the page's own URL, which is never new */
    snprintf(newURLaddr, sizeof newURLaddr, "%s", URLaddr);

    /* separate the page URL into protocol, host and file name */
    if (extractSplitBase(URLaddr, protocol, hostName, fileName,
                         sizeof protocol) != 0) {
        return newURLaddr;
    }

    /* the opening quote must belong to this tag */
    quote = strchr(pbuf, '"');
    tagEnd = strchr(pbuf, '>');
    if (quote == NULL || (tagEnd != NULL && tagEnd < quote)) {
        return newURLaddr;
    }

    /* copy the quoted text without white space */
    for (p = quote + 1; *p != '\0' && *p != '"'; p++) {
        if (isspace((unsigned char)*p)) {
            continue;
        }
        if (n + 1 >= sizeof link) {
            return newURLaddr;      /* link too long to handle */
        }
        link[n++] = *p;
    }
    link[n] = '\0';
    rest = link;

    /* a scheme is present only when ':' comes before any '/', '?' or '#' */
    schemeLen = strcspn(rest, ":/?#");
    if (schemeLen > 0 && rest[schemeLen] == ':') {
        for (i = 0; i < schemeLen; i++) {
            protocol[i] = (char)tolower((unsigned char)rest[i]);
        }
        protocol[schemeLen] = '\0';
        rest += schemeLen + 1;
    }

    /* only http links are followed */
    if (strcmp(protocol, "http") != 0) {
        return newURLaddr;
    }

    /* the fragment never reaches the server */
    rest[strcspn(rest, "#")] = '\0';

    /* "//host..." replaces the host name */
    if (strncmp(rest, "//", 2) == 0) {
        rest += 2;
        hostLen = strcspn(rest, "/");
        if (hostLen == 0) {
            return newURLaddr;
        }
        memcpy(hostName, rest, hostLen);
        hostName[hostLen] = '\0';
        rest += hostLen;
        hasHost = 1;
    }

    if (*rest == '/') {
        /* absolute path on the (possibly new) host */
        strcpy(fileName, rest);
    } else if (*rest == '\0') {
        if (!hasHost) {
            return newURLaddr;      /* same document */
        }
        strcpy(fileName, "/");      /* default file name */
    } else {
        /* relative path: keep the directory of the current page */
        slash = strrchr(fileName, '/');   /* fileName always starts with '/' */
        dirLen = (size_t)(slash - fileName) + 1;
        if (dirLen + strlen(rest) >= sizeof fileName) {
            return newURLaddr;
        }
        strcpy(fileName + dirLen, rest);
    }

    snprintf(newURLaddr, sizeof newURLaddr, "%s",
             buildURL(protocol, hostName, fileName));
    return newURLaddr;
}
/* Down load the given URL web page and search the key word, if find, save URL */
char *downLoadFindKey(fd1, URLaddr,keyWord)
int fd1;
char *URLaddr, *keyWord;
{
int port=80, s,lenp;
char *message,pbuf[BUFSIZE], protocol[100], hostName[100], fileName[100];
/* to separate the URL to protocol, host, file name */
strcpy(protocol, URLaddr);
lenp=strcspn(protocol, "/");
*(protocol+lenp-1)='\0';
strcpy(hostName, strstr(URLaddr,"//"));
strcpy(hostName, strchr(hostName, (*(hostName+2))));
strcpy(fileName,strchr(hostName, '/'));
lenp=strcspn(hostName,"/");
*(hostName+lenp)='\0';
s=requestConnection(hostName,port);
message=buildHTTPCommand("GET",fileName);
strcpy(pbuf, writeAndReadAndFind(s,fd1,message,keyWord,URLaddr));
return(pbuf);
}
/*
 * Build the request line "<http> <path>\n", e.g. "GET /index.html\n".
 *
 * The text is kept in a heap buffer owned by this function; the buffer is
 * resized to fit on every call, so a long path cannot overflow it, and it is
 * reused (and overwritten) by the next call.  The caller must not free it.
 * The former version returned the address of a local array.
 *
 * Exits with status 1 when memory cannot be obtained.
 */
char *buildHTTPCommand(char *http, char *path) {
    static char *tmpCmd = NULL;
    size_t need = strlen(http) + 1 + strlen(path) + 1 + 1; /* ' ', '\n', NUL */
    char *grown = realloc(tmpCmd, need);

    if (grown == NULL) {
        perror("buildHTTPCommand");
        exit(1);
    }
    tmpCmd = grown;
    snprintf(tmpCmd, need, "%s %s\n", http, path);
    return tmpCmd;
}
/*
 * Open a TCP connection to `serverMachine` on port `serverPort`.
 *
 * Returns the connected socket descriptor.  If the socket cannot be created
 * or the connection fails, the reason is reported with perror() and the
 * process exits with status 1.
 */
int requestConnection(char *serverMachine, int serverPort) {
    int s;
    struct sockaddr_in serverSocketName;

    s = socket(AF_INET, SOCK_STREAM, 0);
    if (s < 0) {
        perror("requestConnection: Socket failed");
        exit(1);
    }
    makeDestSA(&serverSocketName, serverMachine, serverPort);
    printSA(serverSocketName);
    if (connect(s, (struct sockaddr *)&serverSocketName,
                sizeof(struct sockaddr_in)) < 0) {
        perror("requestConnection: Connect failed");
        close(s);
        exit(1);
    }
    return s;
}
/*
 * Send `message` on socket `s`, collect the reply and look for `keyWord`.
 *
 * The reply is read until end of stream, a read error, or until BUFSIZE-1
 * bytes have been stored; the socket is then closed.  The search is done on
 * a lower-cased copy of the reply, so keyWord must be given in lower case.
 * When the key word is found, URLaddr plus a newline is written to fd1 and
 * a message is printed; a failed write ends the process with status 1.
 *
 * Returns the reply as a NUL-terminated string in a static buffer that the
 * next call overwrites.  A NUL byte inside the reply ends the returned
 * string early.
 */
char *writeAndReadAndFind(int s, int fd1, char *message, char *keyWord, char *URLaddr) {
    static char catbuf[BUFSIZE];    /* the reply as received */
    static char tmpbuf[BUFSIZE];    /* lower-cased copy for the search */
    size_t total = 0, i, urlLen;
    ssize_t n;
    char *line;

    catbuf[0] = '\0';

    if (write(s, message, strlen(message)) < 0) {
        perror("writeAndRead:Write()");
    }
    anyThingThere(s);

    /* read straight into the result buffer, always leaving room for the NUL */
    while (total < sizeof catbuf - 1) {
        n = read(s, catbuf + total, sizeof catbuf - 1 - total);
        if (n < 0) {
            perror("writeAndRead: Receive");
            break;
        }
        if (n == 0) {
            break;                  /* end of stream */
        }
        total += (size_t)n;
    }
    catbuf[total] = '\0';
    close(s);

    /* find the key word in the page and store the URL */
    for (i = 0; catbuf[i] != '\0'; i++) {
        tmpbuf[i] = (char)tolower((unsigned char)catbuf[i]);
    }
    tmpbuf[i] = '\0';

    if (strstr(tmpbuf, keyWord)) {
        urlLen = strlen(URLaddr);
        line = malloc(urlLen + 2);
        if (line == NULL) {
            printf("Error on write.\n");
            exit(1);
        }
        memcpy(line, URLaddr, urlLen);
        line[urlLen] = '\n';
        line[urlLen + 1] = '\0';
        if (write(fd1, line, urlLen + 1) != (ssize_t)(urlLen + 1)) {
            printf("Error on write.\n");
            exit(1);
        }
        free(line);
        printf("Find the \"%s\" at %s\n", keyWord, URLaddr);
    }
    return catbuf;
}
/*
 * Wait up to ten seconds for descriptor `s` to become readable.
 *
 * Returns the result of select(): 1 when `s` is readable, 0 when the
 * timeout expired, -1 on error (reported with perror()).  -1 is also
 * returned, without calling select(), when `s` is negative or too large
 * to be stored in an fd_set.
 */
int anyThingThere(int s) {
    fd_set readable;
    struct timeval timeout;
    int n;

    if (s < 0 || s >= FD_SETSIZE) {
        fprintf(stderr, "anyThingThere: descriptor %d out of range\n", s);
        return -1;
    }
    FD_ZERO(&readable);
    FD_SET(s, &readable);
    timeout.tv_sec = 10;    /* seconds */
    timeout.tv_usec = 0;    /* microseconds */

    /* first argument is the highest descriptor in any set, plus one */
    n = select(s + 1, &readable, NULL, NULL, &timeout);
    if (n < 0) {
        perror("anyThingThere: Select fail:\n");
    }
    return n;
}
/*
 * Fill in *sa with the IPv4 address of `hostname` and the given `port`
 * (stored in network byte order).
 *
 * The structure is cleared first.  If the name cannot be resolved to an
 * IPv4 address, a message is printed and the process exits with status 1
 * instead of dereferencing a null result.
 *
 * Returns 0.
 */
int makeDestSA(struct sockaddr_in *sa, char *hostname, int port) {
    struct hostent *host;

    memset(sa, 0, sizeof *sa);
    sa->sin_family = AF_INET;
    sa->sin_port = htons((unsigned short)port);

    host = gethostbyname(hostname);
    if (host == NULL || host->h_addrtype != AF_INET ||
        host->h_addr_list[0] == NULL) {
        fprintf(stderr, "makeDestSA: cannot resolve host \"%s\"\n", hostname);
        exit(1);
    }
    memcpy(&sa->sin_addr, host->h_addr_list[0], sizeof sa->sin_addr);
    return 0;
}
/*
 * Debugging hook for a socket address.
 *
 * Printing of the address family, IP address and port is disabled, so the
 * function currently does nothing with `sa`.
 *
 * Returns 0.
 */
int printSA(struct sockaddr_in sa) {
    (void)sa;   /* intentionally unused while the output is disabled */
    return 0;
}