/***************************************************************************\
*  webgrab - v1.3 - Copyright 1995, Brian J. Swetland                       *
*                                                                           *
*   - initial version by Brian Swetland                                     *
*   - cleaned up a bit by Brandon Long                                      *
*   - proxy support by Kristin Buxton                                       *
*   - cleaned up more by Brian Swetland                                     *
*                                                                           *
*  bcl version modified by Brandon Long                                     *
*   - support the WWW_PROXY env var                                         *
*   - support for non http:// urls through a proxy                          *
*   - support for not having to use a protocol, assume http:                *
*                                                                           *
*  Free for any personal or non-commercial use.                             *
*  Use at your own risk.  If you like it, buy the authors a pizza.          *
\***************************************************************************/

/* Ask for the POSIX declarations (fdopen, dup, close, sockets) even when the
   compiler is run in a strict ISO C mode. */
#ifndef _POSIX_C_SOURCE
# define _POSIX_C_SOURCE 200112L
#endif

#define VERSION "1.3bcl"

/* NOTE(review): the header names of the original #include lines were lost
   from this copy of the file; the list below is reconstructed from the
   functions the code actually calls.  The old __bsdi__/NeXT conditional
   presumably selected a malloc header -- <stdlib.h> covers that; confirm. */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>
#include <netdb.h>

/* strdup isn't portable, so we make our own.
   Returns a malloc'd copy of s that the caller owns.  Exits with an error
   message if memory cannot be allocated, so the result is never NULL. */
char *strd(char *s)
{
    size_t size = strlen(s) + 1;
    char *d = malloc(size);

    if (d == NULL) {
        fprintf(stderr, "error: out of memory\n");
        exit(1);
    }
    memcpy(d, s, size);
    return d;
}

/* Returns a malloc'd, NUL-terminated copy of the len bytes starting at start.
   Exits on allocation failure. */
static char *copy_span(const char *start, size_t len)
{
    char *d = malloc(len + 1);

    if (d == NULL) {
        fprintf(stderr, "error: out of memory\n");
        exit(1);
    }
    memcpy(d, start, len);
    d[len] = '\0';
    return d;
}

/* True when c ends a URL embedded in text: NUL, '"', '>' or whitespace. */
static int is_url_end(char c)
{
    /* <ctype.h> functions need an unsigned char value, not a plain char. */
    return c == '\0' || c == '"' || c == '>' || isspace((unsigned char)c);
}

/* Parses the leading decimal number of s as a TCP port.
   Returns the port (1..65535), or 0 if there is no number or it is out of
   range.  Characters after the digits are ignored. */
static int parse_port(const char *s)
{
    char *end;
    long value = strtol(s, &end, 10);

    if (end == s || value < 1 || value > 65535)
        return 0;
    return (int)value;
}

/* parses URL looking like blah://host[:port][/path]
   If the url contains no ':' at all, the protocol is assumed to be http and
   the whole string is treated as host[/path].  The path is terminated when
   it hits >, ", or whitespace.

   On success returns the port number (80 when none is given) and stores
   malloc'd strings in *protocal, *host and *path; the caller owns them.
   On failure returns 0 and leaves all three outputs NULL.
   The url string itself is not modified. */
int parseURL(char *url, char **protocal, char **host, char **path)
{
    char *p = url;
    char *start;
    long port;

    *protocal = NULL;
    *host = NULL;
    *path = NULL;

    /* skip anything up to the first : (the one after http, etc) */
    while (*p && *p != ':')
        p++;

    if (!*p) {
        /* no ':' anywhere: assume http:// */
        p = url;
        *protocal = strd("http");
    } else {
        /* REQUIRE two '/'s after the ':' (the first test short-circuits,
           so p[2] is never read past the terminating NUL) */
        if (p[1] != '/' || p[2] != '/')
            return 0;
        *protocal = copy_span(url, (size_t)(p - url));
        p += 3;
    }

    /* hostname is terminated by ':', '/', '>', '"', or whitespace */
    start = p;
    while (*p != ':' && *p != '/' && !is_url_end(*p))
        p++;
    *host = copy_span(start, (size_t)(p - start));

    /* optionally read a portnumber */
    if (*p == ':') {
        p++;
        port = 0;
        while (isdigit((unsigned char)*p)) {
            port = port * 10 + (*p - '0');
            if (port > 65535)       /* also keeps the sum from overflowing */
                goto fail;
            p++;
        }
        /* the port must be non-zero and followed by a path or the end */
        if (port == 0 || (*p != '/' && !is_url_end(*p)))
            goto fail;
    } else {
        port = 80;
    }

    /* still more: copy the path, or default to the root */
    if (*p == '/') {
        start = p;
        while (!is_url_end(*p))
            p++;
        *path = copy_span(start, (size_t)(p - start));
    } else {
        *path = strd("/");
    }

    return (int)port;

fail:
    free(*host);
    free(*protocal);
    *host = NULL;
    *protocal = NULL;
    return 0;
}

/* Prints the command line help to stdout and exits with status 1.
   argv is the program name to show in the usage line. */
void usage(char *argv)
{
    printf("\nWebgrab: The Command Line Browser\tVersion %s \n",VERSION);
    printf("Usage: %s [-shrvpd] [<proxy>] <url>\n",argv);
    printf(" -s Suppress Headers\n");
    printf(" -h Headers Only\n");
    printf(" -r Read HTTP headers from stdin\n");
    printf(" -v Verbose (show the request on stderr)\n");
    printf(" -d Do nothing\n");
    printf(" -p Next argument is <proxy>\n");
    printf(" <proxy> HTTP Proxy Host ( hostname[:port] format )\n");
    printf(" <url> URL to retrieve (in http:// format)\n\n");
    exit(1);
}

/* Fetches one URL over HTTP/1.0, directly or through a proxy, and copies
   the response to stdout.  The proxy comes from the WWW_PROXY environment
   variable or from the argument following -p.
   Exits with 0 on success and 1 on any error. */
int main(int argc, char *argv[])
{
    int s, rs, i;
    int port = 0, pport = 0;
    struct sockaddr_in sa;
    struct hostent *hp;
    FILE *fpo, *fpi;
    char buf[1024];
    size_t n;
    char *flag, *colon;
    char *path = NULL, *host = NULL;
    char *protocal = NULL;
    char *proxy = NULL;
    char *url = NULL;
    const char *method;

    /* operational flags */
    int ignore=0,head=0,readin=0,proxynext=0,urlmunge=0,verbose=0;

    proxy = getenv("WWW_PROXY");
    if (proxy && !*proxy)
        proxy = NULL;               /* an empty variable means "no proxy" */

    for (i = 1; i < argc; i++) {
        if (proxynext) {
            /* this arg is our proxy */
            proxy = argv[i];
            proxynext = 0;
            continue;
        }

        if (argv[i][0] == '-') {
            /* walk the flag letters of THIS argument */
            for (flag = &argv[i][1]; *flag; flag++) {
                switch (*flag) {
                case 'r':
                    readin = 1;
                    break;
                case 's':
                    ignore = 1;
                    break;
                case 'h':
                    head = 1;
                    break;
                case 'v':
                    verbose = 1;
                    break;
                case 'd':
                    /* listed in usage() as "Do nothing": accept and ignore */
                    break;
                case 'p':
                    if (proxy || proxynext)
                        usage(argv[0]);
                    proxynext = 1;
                    break;
                default:
                    usage(argv[0]);
                }
            }
            continue;
        }

        /* must be a url */
        if (url)
            usage(argv[0]);
        url = argv[i];
    }

    if (proxynext || !url)
        usage(argv[0]);

    method = head ? "HEAD" : "GET";

    /* find the server */
    if (proxy) {
        /* work on a private copy: the string may belong to the environment */
        proxy = strd(proxy);
        pport = 80;

        /* look for a portnum */
        colon = strchr(proxy, ':');
        if (colon) {
            *colon++ = '\0';
            pport = parse_port(colon);
            if (!pport) {
                fprintf(stderr,"error: invalid proxy port\n");
                exit(1);
            }
        }

        if (!(hp = gethostbyname(proxy))) {
            fprintf(stderr,"error: can't get proxy %s.\n",proxy);
            exit(1);
        }

        /* an unparsable url is passed to the proxy verbatim */
        if (!(port = parseURL(url, &protocal, &host, &path))) {
            fprintf(stderr,"%s: url parse failed, trying proxy anyways\n",argv[0]);
            urlmunge = 1;
        }
    } else {
        if (!(port = parseURL(url, &protocal, &host, &path))) {
            fprintf(stderr,"error: invalid url\n");
            exit(1);
        }

        if (strcmp(protocal,"http")) {
            fprintf(stderr,"%s: error: webgrab only supports http without a proxy\n",argv[0]);
            exit(1);
        }

        if (!(hp = gethostbyname(host))) {
            fprintf(stderr,"error: can't get host %s.\n",host);
            exit(1);
        }
    }

    if (verbose) {
        fprintf(stderr, "Webgrab: The Command Line Browser\tVersion %s \n",VERSION);
        if (proxy)
            fprintf(stderr,"Proxy: %s:%d\n",proxy,pport);
        fprintf(stderr, "Request: %s ",method);
        if (urlmunge)
            fprintf(stderr,"%s ",url);
        else
            fprintf(stderr,"%s://%s:%d%s ",protocal,host,port,path);
        fprintf(stderr,"HTTP/1.0\n");
        fprintf(stderr,"\n");
    }

    /* Setup the socket.  sockaddr_in only holds IPv4 addresses, so refuse
       anything else rather than overrun sin_addr. */
    if (hp->h_addrtype != AF_INET || hp->h_addr_list[0] == NULL ||
        hp->h_length != (int)sizeof(sa.sin_addr)) {
        fprintf(stderr,"error: unsupported address type\n");
        exit(1);
    }
    memset(&sa, 0, sizeof(sa));
    sa.sin_family = AF_INET;
    sa.sin_port = htons((unsigned short)(proxy ? pport : port));
    memcpy(&sa.sin_addr, hp->h_addr_list[0], sizeof(sa.sin_addr));

    /* allocate the socket */
    if ((s = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
        fprintf(stderr,"error: can't get socket\n");
        exit(1);
    }

    /* connect to the server */
    if (connect(s, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
        close(s);
        fprintf(stderr,"error: can't connect\n");
        exit(1);
    }

    /* one stream per direction; the read side gets its own descriptor so
       each stream can be fclose()d independently */
    fpo = fdopen(s,"w");
    rs = dup(s);
    fpi = (rs >= 0) ? fdopen(rs,"r") : NULL;
    if (!fpo || !fpi) {
        fprintf(stderr,"error: can't open socket streams\n");
        exit(1);
    }

    /* request line: a proxy needs the absolute url, a server just the path */
    if (proxy) {
        if (urlmunge)
            fprintf(fpo,"%s %s HTTP/1.0\r\n",method,url);
        else
            fprintf(fpo,"%s %s://%s:%d%s HTTP/1.0\r\n",method,
                    protocal,host,port,path);
    } else {
        fprintf(fpo,"%s %s HTTP/1.0\r\n",method,path);
    }

    if (readin) {
        /* copy headers from stdin ... */
        while ((n = fread(buf,1,sizeof(buf),stdin)) > 0)
            fwrite(buf,1,n,fpo);
    } else {
        /* send our normal header info */
        fprintf(fpo, "User-Agent: WebGrab/%s (commandline forever)\r\n", VERSION);
    }

    /* blank line ends the request */
    fputs("\r\n",fpo);
    if (fflush(fpo) != 0) {
        fprintf(stderr,"error: can't send request\n");
        exit(1);
    }

    /* handle headers: they end at the first line that starts with a control
       character (the blank "\r\n" line), which is still echoed */
    while (fgets(buf,sizeof(buf),fpi) != NULL) {
        if (!ignore)
            fputs(buf,stdout);
        if ((unsigned char)buf[0] < ' ')
            break;
    }

    /* copy the body through unchanged */
    while ((n = fread(buf,1,sizeof(buf),fpi)) > 0) {
        if (fwrite(buf,1,n,stdout) != n) {
            fprintf(stderr,"error: can't write output\n");
            exit(1);
        }
    }

    fclose(fpi);
    fclose(fpo);

    if (fflush(stdout) != 0) {
        fprintf(stderr,"error: can't write output\n");
        exit(1);
    }
    return 0;
}