/* ------------------------------------------------------------------- */
/* htget   : Fetch a file using HTTP protocol                          */
/*                                                                     */
/* Author  : Ole Husby, BIBSYS                                         */
/* Updated : 1998-09-30                                                */
/*                                                                     */
/* ------------------------------------------------------------------- */
/*                                                                     */
/* htget(url, type, timeout_seconds, outfile, content_type, location)  */
/*                                                                     */
/* Returns: HTTP statuscode, with additional private:                  */
/*            0 : OK ( = 200)                                          */
/*          900 : Read error, timeout, or connection closed early      */
/*          901 : Syntax error in url                                  */
/*          902 : Unknown host                                         */
/*          903 : No response from server (no connection)              */
/*          904 : File is not text/html                                */
/*          905 : Statusline > 255 bytes                               */
/*          906 : Statusline < 4 bytes                                 */
/*          907 : Statusline not starting with "HTTP"                  */
/*          908 : Statuscode not numeric                               */
/*          909 : Size of header > BUFSIZE                             */
/*          910 : Unable to open or write output file                  */
/*          999 : Unspecified TCP/IP error                             */
/*                                                                     */
/* Writes to outfile, depending on type, if statuscode = 0 | 200 :     */
/*                                                                     */
/*     type = 0 :  Nothing                                             */
/*     type = 1 :  HTTP header                                         */
/*     type = 2 :  HTTP header + entitybody                            */
/*     type = 3 :  HTTP entitybody                                     */
/*     type = 4 :  HTTP entitybody if text/html                        */
/*     type = 5 :  HTTP entitybody up to </HEAD> if text/html          */
/*     type = 6 :  HTTP entitybody up to </HEAD> if text/html          */
/*                 HTTP entitybody if application/marc                 */
/*                                                                     */
/* Buffer contract for callers of htget():                             */
/*     content_type must have room for 128 bytes (sizeof conType)      */
/*     location     must have room for LOCSIZE bytes                   */
/* Either may be NULL when the caller is not interested in the value.  */
/*                                                                     */
/* Timeouts are enforced with poll() on the socket.  The historical    */
/* signal()/alarm() scheme did not interrupt read() on systems where   */
/* signal() installs restarting handlers, and it clobbered the         */
/* caller's SIGALRM disposition.                                       */
/*                                                                     */
/* ------------------------------------------------------------------- */

/* NOTE(review): the header names of the original #include list were   */
/* lost in extraction; this list covers every name used below.         */
#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netdb.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <poll.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>

#define FALSE 0
#define TRUE 1
#define BUFSIZE 10000

/* Upper bound (including the NUL) on what is copied into the caller's */
/* location buffer.  htget() itself rejects URLs longer than 1023      */
/* bytes, so a longer Location could not be followed anyway.           */
#define LOCSIZE 1024

#define TYPE_NONE 0
#define TYPE_HTTPHEAD 1
#define TYPE_HTTPALL 2
#define TYPE_HTTPBODY 3
#define TYPE_HTMLALL 4
#define TYPE_HTMLHEAD 5
#define TYPE_MARC 6

#define TRACE 0
#define AGENT "BIBSYS_htget v1.1"

char conType[128];          /* Content-Type of the last response       */
int htmlonly, marcrecord;   /* set by getHeader(), read by getBody()   */

/* Status (0 for 200) parsed by the last getHeader() call.  Lets       */
/* htget() report the real HTTP status when it ignores a 904.          */
static int lastStatus;


/* ------------------------------------------------------------------- */
/* thandler: no-op signal handler                                      */
/*                                                                     */
/* No longer installed by htget(); kept so that code linking against   */
/* this symbol still builds.                                           */
/* ------------------------------------------------------------------- */

void thandler(int i)
{
    (void) i;
}


/* ------------------------------------------------------------------- */
/* timedRead: read() preceded by a bounded wait for readability        */
/*                                                                     */
/* timeout is in seconds; a value <= 0 waits indefinitely, which is    */
/* what alarm(0) meant in the previous implementation.                 */
/*                                                                     */
/* returns: bytes read, 0 at end of stream, -1 on timeout or error     */
/* ------------------------------------------------------------------- */

static ssize_t timedRead(int fd, void *dst, size_t len, int timeout)
{
    struct pollfd pfd;
    ssize_t got;
    int ms, rc;

    if (timeout <= 0)
        ms = -1;
    else if (timeout > INT_MAX / 1000)
        ms = INT_MAX;                 /* avoid overflow in the conversion */
    else
        ms = timeout * 1000;

    pfd.fd = fd;
    pfd.events = POLLIN;
    pfd.revents = 0;

    do {
        rc = poll(&pfd, 1, ms);
    } while (rc < 0 && errno == EINTR);

    if (rc <= 0)
        return -1;

    do {
        got = read(fd, dst, len);
    } while (got < 0 && errno == EINTR);

    return got;
}


/* ------------------------------------------------------------------- */
/* writeAll: write len bytes to fd, retrying short writes              */
/*                                                                     */
/* returns: 0 if everything was written, -1 otherwise                  */
/* ------------------------------------------------------------------- */

static int writeAll(int fd, const char *data, size_t len)
{
    ssize_t k;

    while (len > 0) {
        k = write(fd, data, len);
        if (k < 0) {
            if (errno == EINTR)
                continue;
            return -1;
        }
        if (k == 0)
            return -1;
        data += k;
        len -= (size_t) k;
    }
    return 0;
}


/* ------------------------------------------------------------------- */
/* geteoHEAD: Look for the end-of-head tag (</HEAD>, any letter case)  */
/*                                                                     */
/* If found, the buffer is cut immediately after the tag: a newline    */
/* and a terminating NUL are stored there.  The caller must therefore  */
/* provide one spare byte after the string's own terminator.           */
/*                                                                     */
/* NOTE(review): most of the original body was lost in extraction.     */
/* This is a reconstruction based on the surviving "+ 7" offset        */
/* (the length of the tag) and the surviving header comment.           */
/*                                                                     */
/* returns: 1 if the tag was found, 0 otherwise                        */
/* ------------------------------------------------------------------- */

int geteoHEAD(char *buf)
{
    static const char endTag[] = "</head>";
    const size_t tagLen = sizeof endTag - 1;
    char *p;

    for (p = strchr(buf, '<'); p; p = strchr(p + 1, '<')) {
        if (strncasecmp(p, endTag, tagLen) == 0) {
            p[tagLen] = '\n';
            p[tagLen + 1] = '\0';
            return 1;
        }
    }
    return 0;
}


/* ------------------------------------------------------------------- */
/* writeRequest: send request to server                                */
/*                                                                     */
/* Short writes are retried until the whole request has been sent.     */
/*                                                                     */
/* returns: Number of bytes written, or -1 on a write error            */
/* ------------------------------------------------------------------- */

int writeRequest(char *req, int server)
{
    size_t total = strlen(req);
    size_t sent = 0;
    ssize_t k;

    if (TRACE) printf("*** send(): %s\n", req);

    while (sent < total) {
        k = write(server, req + sent, total - sent);
        if (k < 0) {
            if (errno == EINTR)
                continue;
            return -1;
        }
        if (k == 0)
            break;
        sent += (size_t) k;
    }
    return (int) sent;
}


/* ------------------------------------------------------------------- */
/* getBody : Read the Entity-body and write it to the descriptor fd    */
/*                                                                     */
/*   Data is read from the socket until the server closes it or a      */
/*   chunk ends in '\0' (that byte is not written).  When the global   */
/*   htmlonly is set, output stops after the first chunk in which      */
/*   geteoHEAD() finds the end-of-head tag.  A tag split across two    */
/*   reads is not detected.                                            */
/*                                                                     */
/*   A single read waiting longer than timeout seconds is abandoned.   */
/*   filename is accepted for interface compatibility and not used.    */
/*                                                                     */
/*   Returns   0 if ok                                                 */
/*           900 if read error or timeout                              */
/*           910 if the output could not be written                    */
/*                                                                     */
/* ------------------------------------------------------------------- */

int getBody(int server, int timeout, int fd, char *filename)
{
    /* +2: one byte for the NUL stored after the data and one for the  */
    /* extra byte geteoHEAD() appends when the tag ends the chunk.     */
    char buf[BUFSIZE + 2];
    ssize_t got;
    size_t len;
    int done = FALSE;
    int headEnd;

    (void) filename;

    if (TRACE) printf("*** Read entitybody\n");

    while (!done) {
        got = timedRead(server, buf, BUFSIZE, timeout);
        if (got < 0)
            return 900;
        if (got == 0)
            break;                         /* server closed socket */

        len = (size_t) got;
        if (buf[len - 1] == '\0') {        /* endmark: last chunk */
            len--;
            done = TRUE;
        }
        if (len == 0)
            continue;

        buf[len] = '\0';
        headEnd = 0;
        if (htmlonly) {
            headEnd = geteoHEAD(buf);
            len = strlen(buf);
        }

        if (writeAll(fd, buf, len) < 0)
            return 910;
        if (headEnd)
            break;
    }

    return 0;
}


/* ------------------------------------------------------------------- */
/* getHeader : Read HTTP statusline and response header                */
/*                                                                     */
/*   For type 1 and 2 the statusline and the header are written to     */
/*   fd.  The Content-Type value is stored in the global conType       */
/*   (truncated to fit) and a Location value is copied to loc          */
/*   (at most LOCSIZE bytes, skipped if loc is NULL).  The globals     */
/*   htmlonly / marcrecord are set according to type and content       */
/*   type.  reason is accepted for interface compatibility only.       */
/*                                                                     */
/*   Returns   0 if ok                                                 */
/*           900 if read error, timeout or premature end of stream     */
/*           904 if Content-type not "text/html"                       */
/*           905 if Statusline > 255 bytes                             */
/*           906 if Statusline < 4 bytes                               */
/*           907 if Statusline not starting with "HTTP"                */
/*           908 if Statuscode not numeric                             */
/*           909 if size of header > BUFSIZE                           */
/*           910 if the output could not be written                    */
/*           HTTP statuscode if <> 200                                 */
/*                                                                     */
/*   On return the socket is positioned at the start of the            */
/*   Entity-Body (unless type is 0, when only the statusline is read). */
/*                                                                     */
/* ------------------------------------------------------------------- */

int getHeader(int server, int timeout, int type, int fd, char *reason, char *loc)
{
    char buf[BUFSIZE + 1];
    char *code, *line, *next, *value;
    size_t len, codeLen, i;
    ssize_t got;
    int statuscode, rc, atLineStart;
    int wantHeader = (type == TYPE_HTTPHEAD) || (type == TYPE_HTTPALL);
    char c;

    (void) reason;
    *conType = 0;
    lastStatus = 0;

    if (TRACE) printf("*** Read statusline\n");

    /* Read HTTP statusline (until LF, CR ignored, max 255 bytes) */

    len = 0;
    for (;;) {
        got = timedRead(server, &c, 1, timeout);
        if (got <= 0)
            return 900;
        if (c == '\r')
            continue;
        if (TRACE) printf("%c", c);
        if (c == '\0' || len >= 255)
            return 905;
        if (c == '\n')
            break;
        buf[len++] = c;
    }
    buf[len] = '\0';

    /* Write statusline */

    if (wantHeader) {
        if (writeAll(fd, buf, len) < 0 || writeAll(fd, "\n\n", 2) < 0)
            return 910;
    }

    if (TRACE) printf("*** read() : (%d) %s\n", (int) len + 1, buf);

    /* Parse statusline.  The historical byte count included the LF. */

    if (len + 1 < 4)
        return 906;
    if (strncmp(buf, "HTTP", 4) != 0)
        return 907;

    code = buf + strcspn(buf, " ");
    code += strspn(code, " ");
    codeLen = strcspn(code, " ");

    if (codeLen == 0)
        statuscode = 200;              /* no code present: assume OK */
    else {
        if (codeLen > 9)               /* would not fit in an int */
            return 908;
        for (i = 0; i < codeLen; i++)
            if (!isdigit((unsigned char) code[i]))
                return 908;
        statuscode = (int) strtol(code, NULL, 10);
    }

    if (statuscode == 200)
        statuscode = 0;
    lastStatus = statuscode;

    if (!type)
        return statuscode;

    /* Read HTTP response header (until empty line, max BUFSIZE bytes). */
    /* We start at the beginning of a line, so an immediately following */
    /* LF means the header section is empty.                            */

    if (TRACE) printf("*** Read responseheader\n");

    len = 0;
    atLineStart = TRUE;
    for (;;) {
        got = timedRead(server, &c, 1, timeout);
        if (got <= 0)
            return 900;
        if (c == '\r')
            continue;
        if (c == '\0' || len >= BUFSIZE)
            return 909;
        if (c == '\n' && atLineStart)
            break;
        atLineStart = (c == '\n');
        buf[len++] = c;
    }
    buf[len] = '\0';

    /* Write rest of HTTP header */

    if (wantHeader) {
        if (writeAll(fd, buf, len) < 0 || writeAll(fd, "\n", 1) < 0)
            return 910;
    }

    if (TRACE) printf("*** read() : (%d) %s\n", (int) len, buf);

    /* Parse header for Content-Type and Location */

    rc = 904;
    for (line = buf; *line; line = next) {
        next = strchr(line, '\n');
        if (next)
            *next++ = '\0';
        else
            next = line + strlen(line);

        if (strncasecmp(line, "Content-Type:", 13) == 0) {
            value = line + 13;
            while (*value == ' ' || *value == '\t')
                value++;
            snprintf(conType, sizeof conType, "%s", value);

            if (strncasecmp(value, "text/html", 9) == 0) {
                if ((type == TYPE_HTMLHEAD) || (type == TYPE_MARC))
                    htmlonly = 1;
                rc = 0;
            }
            else if (strncasecmp(value, "application/marc", 16) == 0) {
                if (type == TYPE_MARC) {
                    marcrecord = 1;
                    rc = 0;
                }
            }
        }
        else if (strncasecmp(line, "Location:", 9) == 0) {
            value = line + 9;
            while (*value == ' ' || *value == '\t')
                value++;
            if (loc)
                snprintf(loc, LOCSIZE, "%s", value);
        }
    }

    /* All OK. Socket is positioned at start of HTTP Entity-Body */

    if (rc)
        return rc;
    return statuscode;
}


/* ------------------------------------------------------------------- */
/* htget : Fetch a URL                                                 */
/*                                                                     */
/*   iurl       : "http://host[:port][/path]", at most 1023 bytes      */
/*   type       : one of the TYPE_* values, selects what is written    */
/*   timeout    : seconds to wait for each read, <= 0 waits forever    */
/*   outfile    : destination file; NULL or "" selects                 */
/*                /tmp/geturl.tmp (NOTE(review): a fixed name in /tmp  */
/*                is open to symlink attacks; kept for compatibility)  */
/*   h_contype  : receives the Content-Type (>= 128 bytes) or NULL     */
/*   h_location : receives the Location (>= LOCSIZE bytes) or NULL     */
/*                                                                     */
/*   Returns the codes listed at the top of this file.                 */
/* ------------------------------------------------------------------- */

int htget(char *iurl, int type, int timeout, char *outfile,
          char *h_contype, char *h_location)
{
    struct sockaddr_in addr;
    struct hostent *hp;
    char hostname[256], hostport[300], req[1280];
    const char *rest, *path, *colon, *portStr;
    size_t authLen, nameLen, portLen, i;
    int rc, k, port, reqLen;
    int fd = -1;
    int soc;

    if (h_contype)
        *h_contype = 0;
    if (h_location)
        *h_location = 0;
    htmlonly = marcrecord = 0;

    if (!outfile || !*outfile)
        outfile = "/tmp/geturl.tmp";

    /* Parse and validate URL */

    if (!iurl || !*iurl || (strlen(iurl) > 1023))
        return 901;
    if (strncmp(iurl, "http://", 7) != 0)
        return 901;

    rest = iurl + 7;
    authLen = strcspn(rest, "/");          /* "host[:port]" */
    path = rest + authLen;
    if (*path == '/')
        path++;                            /* request adds its own '/' */

    colon = memchr(rest, ':', authLen);
    nameLen = colon ? (size_t) (colon - rest) : authLen;
    if (nameLen == 0 || nameLen >= sizeof hostname)
        return 901;
    memcpy(hostname, rest, nameLen);
    hostname[nameLen] = '\0';

    port = 80;
    if (colon) {
        portStr = colon + 1;
        portLen = authLen - nameLen - 1;
        if (portLen > 5)
            return 901;
        if (portLen > 0) {
            port = 0;
            for (i = 0; i < portLen; i++) {
                if (!isdigit((unsigned char) portStr[i]))
                    return 901;
                port = port * 10 + (portStr[i] - '0');
            }
            if (port < 1 || port > 65535)
                return 901;
        }
    }

    if (port == 80)
        snprintf(hostport, sizeof hostport, "%s", hostname);
    else
        snprintf(hostport, sizeof hostport, "%s:%d", hostname, port);

    reqLen = snprintf(req, sizeof req,
                      "GET /%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\n\r\n",
                      path, hostport, AGENT);
    if (reqLen < 0 || (size_t) reqLen >= sizeof req)
        return 901;

    /* Get IP address */

    hp = gethostbyname(hostname);
    if (!hp || hp->h_addrtype != AF_INET || !hp->h_addr_list[0]
            || (size_t) hp->h_length != sizeof addr.sin_addr)
        return 902;

    /* Get socket and connect */

    soc = socket(AF_INET, SOCK_STREAM, 0);
    if (soc < 0)
        return 999;

    memset(&addr, 0, sizeof addr);
    addr.sin_family = AF_INET;
    memcpy(&addr.sin_addr, hp->h_addr_list[0], sizeof addr.sin_addr);
    addr.sin_port = htons((unsigned short) port);

    if (connect(soc, (struct sockaddr *) &addr, sizeof addr) < 0) {
        close(soc);
        return 903;
    }

    /* Open the destination file */

    if (type) {
        fd = open(outfile, O_WRONLY | O_CREAT | O_TRUNC, 0666);
        if (fd < 0) {
            close(soc);
            return 910;
        }
    }

    /* Write HTTP-request */

    k = writeRequest(req, soc);
    if (k != reqLen) {
        close(soc);
        if (type)
            close(fd);
        return 999;
    }

    /* Read header part of response */

    rc = getHeader(soc, timeout, type, fd, NULL, h_location);

    /* Types below TYPE_HTMLALL do not care about the content type:    */
    /* fall back to the real HTTP status instead of assuming success.  */
    if ((rc == 904) && (type < TYPE_HTMLALL))
        rc = lastStatus;

    if ((rc == 0) && (type > TYPE_HTTPHEAD))
        rc = getBody(soc, timeout, fd, outfile);

    close(soc);
    if (type)
        close(fd);

    if (h_contype)
        memcpy(h_contype, conType, strlen(conType) + 1);

    return rc;
}