#line 2 "guess.c"
/*-
 * C-SaCzech
 * Copyright (c) 1996-2002 Jaromir Dolecek <dolecek@ics.muni.cz>
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by Jaromir Dolecek
 *	for the CSacek project.
 * 4. The name of Jaromir Dolecek may not be used to endorse or promote
 *    products derived from this software without specific prior written
 *    permission.
 *
 * THIS SOFTWARE IS PROVIDED BY JAROMIR DOLECEK ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL JAROMIR DOLECEK BE LIABLE FOR ANY DIRECT,
 * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
 * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
 * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/* $Id: guess.c,v 1.12 2002/02/03 11:13:41 dolecek Exp $ */

#include "csacek.h"

/*
 * Tries to guess the encoding and language supported by the remote client.
 * Uses the ``Accept-Language'' and ``Accept-Charset'' request headers if the
 * client has sent them. If that is not conclusive, ISO-8859-2 is forced for
 * people from Czech Republic, Slovakia, Hungary and Poland (judged by the
 * domain of the remote host), and ASCII for others.
 * This function is only called if the output encoding is NOT stated
 * explicitly (in URL).
 *
 * p               - request parameters; the Vary flags for every request
 *                   header consulted are set in p->flags
 * redirect_client - if zero, the guessed charset (and the part name, when
 *                   none is set yet) is stored into p and the function
 *                   returns; if non-zero, ``Status'' and ``Location''
 *                   output headers are set up to redirect the client to
 *                   the URL carrying the guessed charset
 */
void
csa_toguess(p, redirect_client)
  csa_params_t *p;
  int redirect_client;
{
	const char *charset=NULL, *suffix=NULL, *tocut, *prefix;
	char *newurl;
	const csa_String *strp;
	cstools_t outenc=CSTOOLS_UNKNOWN;

#ifdef CSA_DEBUG
	csa_debug(p->dbg, "csa_toguess: called");
#endif

	/* always vary by Accept-Language */
	CSA_SET(p->flags, CSA_FL_VARY_AL);

	/* test language preferences of client */
	strp = csa_getheaderin(p, "Accept-Language");
	if (strp) {
		if (csa_strcasestr(strp->value, "cz")
		    || csa_strcasestr(strp->value, "cs")
		    || csa_strcasestr(strp->value, "sk"))
		{
			suffix = ".cs";
			outenc = CSTOOLS_ISOLatin2;
		} else if (csa_strcasestr(strp->value, "hu")
			|| csa_strcasestr(strp->value, "pl"))
		{
			/* no hu/pl part exists, but Latin2 is still wanted */
			suffix = ".en";
			outenc = CSTOOLS_ISOLatin2;
		} else if (csa_strcasestr(strp->value, "en"))
			suffix = ".en";
	}

	/*
	 * Always Vary by Accept-Charset - we provide generally different
	 * encodings for different values of Accept-Charset. If client
	 * doesn't send Accept-Charset and the response happens to be
	 * used by proxy cache, we want other clients with different
	 * Accept-Charset to use document in different encoding.
	 */
	CSA_SET(p->flags, CSA_FL_VARY_AC);

	/*
	 * Test charset preferences of client. This overrides outenc
	 * setting from above if Accept-Charset has been sent by client.
	 */
	if ((strp = csa_getheaderin(p, "Accept-Charset")) != NULL)
	{
		cstools_t code;
		size_t skipped;
		const char *value = strp->value, *tmp;
		int maxq=-2000, q;
		char delim;

		while(value && value[0]) {
			value += strspn(value, " \t");
			skipped = strcspn(value, " \t;,");

			/* if the token is "*", use iso-8859-2 */
			if (skipped == 1 && value[0] == '*') {
				outenc = CSTOOLS_ISOLatin2;
				break;
			}

			code = cstools_whichcode(value, skipped);

			/*
			 * Remember which character terminated the token and
			 * step over it - but never over the terminating NUL,
			 * as that would leave value pointing outside of the
			 * header string and make the scan below read memory
			 * which doesn't belong to it.
			 */
			delim = value[skipped];
			value += skipped;
			if (delim != '\0')
				value++;

			/* charsets we do not know are just skipped */
			if (code != CSTOOLS_UNKNOWN) {
				/* check for qualifier; q is kept in 1/1000 */
				q = 1000;
				if (delim == ';') {
					value += strspn(value, " \t");
					if (strncasecmp(value, "q=", 2) == 0)
						q = (int)(atof(value + 2) * 1000);
				}

				/* apply CSacek preferences */
				switch (code) {
				case CSTOOLS_ISOLatin2:
					/* no change */;
					break;
				case CSTOOLS_CP1250:
					q -= 500;
					break;
				case CSTOOLS_ASCII:
					q -= 999;
					break;
				default: /* other code sets */
					q -= 100;
					break;
				}

				if (q > maxq) {
					outenc = code;
					maxq = q;
				}
			}

			/* move to the next element of the list, if any */
			if (delim == '\0')
				value = NULL;
			else if (delim != ',') {
				tmp = strchr(value, ',');
				value = (tmp) ? tmp + 1 : NULL;
			}
		}
	}

	/*
	 * Netscape 4.0 for Mac sends Accept-Charset header with
	 * iso-8859-1 as the only charset (even though it supports
	 * iso-8859-2 as well), so it's unusable and we
	 * have to check the country of remote client anyway.
	 */
	if (outenc == CSTOOLS_ISOLatin1) {
		const csa_String *ua = csa_getheaderin(p, "User-Agent");

		if (ua && strncasecmp(ua->value, "Mozilla/4.", 10) == 0
		    && csa_strcasestr(ua->value, "MAC"))
			outenc = CSTOOLS_UNKNOWN;

		/* used User-Agent header, have to send appropriate Vary */
		CSA_SET(p->flags, CSA_FL_VARY_UA);
	}

	/*
	 * If we do not know language preferences of client or still
	 * don't know charset preferences, use name of remote host
	 * to guess which charset & language it should support.
	 */
	if (!suffix || outenc == CSTOOLS_UNKNOWN) {
		const char *suff;
		const csa_String *remote_host = csa_getvar(p, "REMOTE_HOST");

		/*
		 * NOTE(review): csa_getvar() is assumed to return NULL for
		 * a variable which is not set (as csa_getheaderin() does
		 * for headers) - confirm. Without a host name there is
		 * nothing to guess from, so the defaults below apply.
		 */

		/* if REMOTE_HOST is an IP address, find name */
		if (remote_host && atoi(remote_host->value)) {
			const char *name;
			name = csa_gethostbyaddr(p->pool_req,
					remote_host->value);
			if (name) {
				csa_setvar(p, "REMOTE_HOST", name, 0);
				remote_host = csa_getvar(p, "REMOTE_HOST");
			}
		}
		
		/* browsers of people from Czech Republic, Slovakia,
		 * Poland and Hungary should support iso-8859-2 */
		if (remote_host
		    && (suff = csa_has_suffix(remote_host->value,
				".cz,.sk,.pl,.hu", ',')) != NULL)
		{
			if (outenc == CSTOOLS_UNKNOWN)
				outenc = CSTOOLS_ISOLatin2;
			if (!suffix && (strcasecmp(suff, ".cz") == 0
					    || strcasecmp(suff, ".sk") == 0))
				suffix = ".cs";
		}
	}

	/* if outenc & suffix is still not set, set it to ASCII & en */
	if (!suffix)
		suffix	= ".en";
	if (outenc == CSTOOLS_UNKNOWN)
		outenc = CSTOOLS_ASCII;

	if (!redirect_client) {
		/* if we will continue with guessed values (instead of */
		/* redirecting the client), update request structure   */

		p->outcharset = outenc;

		/* only fill in the part name if none has been set yet */
		if (!*p->part.value) {
			csa_fillstring(&p->part, suffix, -1, -1);
			/* suffix+1 skips the leading dot of the suffix */
			if (strcasecmp(suffix+1, p->dd) == 0)
				CSA_SET(p->flags, CSA_FL_PART_IS_DEF);
			else
				CSA_UNSET(p->flags, CSA_FL_PART_IS_DEF);
		}

		return;
	}

	prefix = csa_getvar(p, "SCRIPT_NAME")->value;

	/* get the part before GUESS or __CHARSET__ out of SCRIPT_NAME and
	 * preserve any suffix the script name might have */
	tocut = csa_strcasestr(prefix, "GUESS");
	if (!tocut) tocut = strstr(prefix, "__CHARSET__");
	if (tocut) {
		prefix = ap_pstrndup(p->pool_tmp, prefix,
			tocut - prefix);
		tocut = strchr(tocut, '.');
		if (tocut) suffix = tocut;
	}

	/* don't use part name if it's same as default partname */
	if (strcasecmp(suffix+1, p->dd) == 0)
		suffix += 1 + strlen(p->dd);
	
	/* get CSacek name for the guessed encoding */
	charset	= cstools_name(outenc, CSTOOLS_TRUENAME);

	/* create redirect link; the buffer holds all three pieces + NUL */
	newurl = (char *) csa_alloca(
		strlen(prefix) + strlen(charset) + strlen(suffix) + 1,
		p->pool_tmp);

	sprintf(newurl, "%s%s%s", prefix, charset, suffix); /* safe */
	newurl = csa_construct_url(p, newurl, NULL);

	csa_setheaderout(p, "Status", "302 Moved Temporarily", 0);
	csa_setheaderout(p, "Location", newurl, CSA_I_COPYVALUE);
}
