HOWTO: ISAPI Filter logging request URL and headers based on User-Agent
A user recently reported on the microsoft.public.inetserver.iis NNTP newsgroup seeing MSNBot causing lots of 406s while indexing static files (like HTML) on his IIS web server. Alas, there is no built in feature in IIS to take a snapshot of a request which match certain criteria (in this case, a request whose User-Agent indicates MSNBot, and we want all the request headers, especially the Accept: header). So, it motivated me to whip together the following ISAPI Filter to do the job.
Question:
I've been having this problem with the MSNbot for several months.
Most requests that it sneds end up with IIS sending back a 406 error.
I've read that a 406 is usually caused by the request headers missing an accept-type header.
Is there anything I can do on IIS v5 (Win2K) to correct this problem with these 406 errors? I think it's really hurting my MSN search engine placement
Answer:
Unfortunately, there is no built-in functionality in IIS that can take a snapshot of the request given some criteria. However, IIS extensibility allows this functionality to be added, as the following ISAPI Filter illustrates.
After installing this filter (either globally to apply to all requests [restart IIS required] or per-site [no restart required]), this filter basically:
- Looks at the User-Agent: request header and see if it starts with "msnbot/"
- If the request is from a client claiming to be the MSNBot, then record the requested URL+Querystring as well as all headers of the request into a log file which is per-PID. Otherwise, do nothing
- The name of the logfile defaults to: C:\inetpub\wwwroot\LogFile.<PID>.log
Thread-safe Logging
In addition to the "usual" sorts of ISAPI Filter behavior, this filter also illustrates a correct way to do something as common as logging while accounting for one of the potential breaking changes in ISAPI Filter behavior from IIS5 to IIS6: ISAPI Filters are no longer singletons on the system.
As always, an ISAPI Filter must log data to a single shared resource (log file) in a thread-safe manner. However, an ISAPI Filter on IIS6 must also be aware that in IIS6 Worker Process Isolation Mode, multiple w3wp.exe can simultaneously run the ISAPI Filter DLL - thus, any process-wide synchronization mechanisms need to be scaled across processes.
- For thread-safety - a critical section controls access to process-wide shared resource like the log file to prevent corruption of it's content by multiple threads running through a given ISAPI Filter DLL. Remember, the ISAPI Filter API may be synchronous, but IIS is multi-threaded so multiple threads could be executing in a synchronous manner through the same Filter module in the same process.
- For cross-process safety - the log filename includes the PID of the process that is running the ISAPI Filter DLL so that threads running through ISAPI Filter DLLs loaded in DIFFERENT processes are prevented from simultaneously accessing the same log file. Critical sections will not be able to control access because they are per-process, and global Mutex is too heavy handed because multiple threads in multiple processes in multiple application pools should be able to independently and simultaneously log data.
Enjoy.
//David
#include <windows.h>
#include <httpfilt.h>
#include <stdio.h>
#define DEFAULT_BUFFER_SIZE 1024
#define MAX_BUFFER_SIZE 4096
#define LOG_FILENAME "C:\\inetpub\\wwwroot\\LogFile"
HANDLE g_hLogFile = INVALID_HANDLE_VALUE;
CRITICAL_SECTION g_LogFileLock;
DWORD
OnPreprocHeaders(
HTTP_FILTER_CONTEXT * pfc,
HTTP_FILTER_PREPROC_HEADERS * pPPH
);
BOOL
WINAPI
GetFilterVersion(
HTTP_FILTER_VERSION * pVer
)
/*++
Purpose:
Required entry point for ISAPI filters. This function
is called when the server initially loads this DLL.
Arguments:
pVer - Points to the filter version info structure
Returns:
TRUE on successful initialization
FALSE on initialization failure
--*/
{
CHAR szLogFile[ MAX_PATH ];
//
// Initialize the logging critical section
//
InitializeCriticalSection( &g_LogFileLock );
//
// TODO: Choose a location where IIS process identity can write to
//
sprintf( szLogFile,
"%s.%d.log",
LOG_FILENAME,
GetCurrentProcessId() );
//
// Initialize the logfile name
//
g_hLogFile = CreateFile(
szLogFile,
GENERIC_WRITE,
FILE_SHARE_READ | FILE_SHARE_WRITE,
NULL,
OPEN_ALWAYS,
FILE_ATTRIBUTE_NORMAL,
NULL
);
if ( g_hLogFile == INVALID_HANDLE_VALUE )
{
//
// Failed to open log file due to GetLastError(). Bail.
//
return FALSE;
}
pVer->dwFilterVersion = HTTP_FILTER_REVISION;
lstrcpyn( pVer->lpszFilterDesc,
"ISAPI Filter to log request headers based on User-Agent",
SF_MAX_FILTER_DESC_LEN );
pVer->dwFlags =
SF_NOTIFY_ORDER_HIGH |
SF_NOTIFY_PREPROC_HEADERS
;
return TRUE;
}
DWORD
WINAPI
HttpFilterProc(
IN HTTP_FILTER_CONTEXT * pfc,
DWORD dwNotificationType,
LPVOID pvNotification
)
/*++
Purpose:
Required filter notification entry point. This function is called
whenever one of the events (as registered in GetFilterVersion) occurs.
Arguments:
pfc - A pointer to the filter context for this notification
NotificationType - The type of notification
pvNotification - A pointer to the notification data
Returns:
One of the following valid filter return codes:
- SF_STATUS_REQ_FINISHED
- SF_STATUS_REQ_FINISHED_KEEP_CONN
- SF_STATUS_REQ_NEXT_NOTIFICATION
- SF_STATUS_REQ_HANDLED_NOTIFICATION
- SF_STATUS_REQ_ERROR
- SF_STATUS_REQ_READ_NEXT
--*/
{
switch ( dwNotificationType )
{
case SF_NOTIFY_PREPROC_HEADERS:
return OnPreprocHeaders(
pfc,
(HTTP_FILTER_PREPROC_HEADERS *) pvNotification );
}
return SF_STATUS_REQ_NEXT_NOTIFICATION;
}
DWORD
OnPreprocHeaders(
HTTP_FILTER_CONTEXT * pfc,
HTTP_FILTER_PREPROC_HEADERS * pPPH
)
{
DWORD dwRet = SF_STATUS_REQ_NEXT_NOTIFICATION;
BOOL fRet = FALSE;
CHAR pBuf[ DEFAULT_BUFFER_SIZE ];
CHAR pBuf2[ DEFAULT_BUFFER_SIZE ];
CHAR * pszBuf = pBuf;
CHAR * pszBuf2 = pBuf2;
DWORD cbBuf = DEFAULT_BUFFER_SIZE;
DWORD cbBuf2 = DEFAULT_BUFFER_SIZE;
CHAR szUserAgent[] = "User-Agent:";
CHAR szUrl[] = "url";
CHAR szMSNBot[] = "msnbot/";
CHAR cbMSNBot = 7;
SetLastError( NO_ERROR );
if ( pfc == NULL ||
pPPH == NULL )
{
SetLastError( ERROR_INVALID_PARAMETER );
goto Finished;
}
fRet = pPPH->GetHeader( pfc, szUserAgent, pszBuf, &cbBuf );
if ( fRet == FALSE )
{
if ( GetLastError() == ERROR_INVALID_INDEX )
{
//
// User-Agent was not given
// Since we are looking for a particular user-agent,
// not finding on is ok.
//
OutputDebugString( "Nothing to do.\n" );
SetLastError( NO_ERROR );
goto Finished;
}
else if ( GetLastError() == ERROR_INSUFFICIENT_BUFFER &&
cbBuf < MAX_BUFFER_SIZE )
{
pszBuf = new CHAR[ cbBuf ];
if ( pszBuf == NULL )
{
SetLastError( ERROR_NOT_ENOUGH_MEMORY );
goto Finished;
}
fRet = pPPH->GetHeader( pfc, szUserAgent, pszBuf, &cbBuf );
if ( fRet == FALSE )
{
goto Finished;
}
OutputDebugString( "Found User-Agent header.\n" );
}
else
{
goto Finished;
}
}
//
// At this point, pszBuf points to the value of User-Agent header.
// Figure out what to do.
//
OutputDebugString( pszBuf );
OutputDebugString( "\n" );
//
// If the User-Agent matches the MSNBot, log request headers
//
if ( _strnicmp( pszBuf, szMSNBot, cbMSNBot ) == 0 )
{
fRet = pfc->GetServerVariable( pfc, "ALL_RAW", pszBuf2, &cbBuf2 );
if ( fRet == FALSE )
{
if ( GetLastError() == ERROR_INSUFFICIENT_BUFFER &&
cbBuf2 < MAX_BUFFER_SIZE )
{
pszBuf2 = new CHAR[ cbBuf2 ];
if ( pszBuf2 == NULL )
{
SetLastError( ERROR_NOT_ENOUGH_MEMORY );
goto Finished;
}
fRet = pfc->GetServerVariable( pfc,
"ALL_RAW",
pszBuf2,
&cbBuf2 );
if ( fRet == FALSE )
{
goto Finished;
}
OutputDebugString( "Read ALL_RAW\n" );
}
else
{
goto Finished;
}
}
//
// At this point, pszBuf2 points to the raw request headers
// But first, retrieve the URL of this request by reusing pszBuf.
// Then append it to disk, thread-safe.
//
if ( pszBuf != pBuf )
{
delete pszBuf;
pszBuf = pBuf;
}
fRet = pPPH->GetHeader( pfc, szUrl, pszBuf, &cbBuf );
if ( fRet == FALSE )
{
if ( GetLastError() == ERROR_INSUFFICIENT_BUFFER &&
cbBuf < MAX_BUFFER_SIZE )
{
pszBuf = new CHAR[ cbBuf ];
if ( pszBuf == NULL )
{
SetLastError( ERROR_NOT_ENOUGH_MEMORY );
goto Finished;
}
fRet = pPPH->GetHeader( pfc, szUrl, pszBuf, &cbBuf );
if ( fRet == FALSE )
{
goto Finished;
}
OutputDebugString( "Found URL.\n" );
}
else
{
goto Finished;
}
}
//
// At this point, pszBuf points to the raw URL...
//
OutputDebugString( "URL = " );
OutputDebugString( pszBuf );
OutputDebugString( "\n" );
OutputDebugString( "ALL_RAW =\n" );
OutputDebugString( pszBuf2 );
//
// Write the URL and raw headers to disk
//
if ( g_hLogFile != INVALID_HANDLE_VALUE )
{
EnterCriticalSection( &g_LogFileLock );
SetFilePointer( g_hLogFile, 0, NULL, FILE_END );
cbBuf--;
fRet = WriteFile( g_hLogFile, pszBuf, cbBuf, &cbBuf, NULL );
cbBuf = 1;
fRet = WriteFile( g_hLogFile, "\n", cbBuf, &cbBuf, NULL );
cbBuf2--;
fRet = WriteFile( g_hLogFile, pszBuf2, cbBuf2, &cbBuf2, NULL );
LeaveCriticalSection( &g_LogFileLock );
}
}
SetLastError( NO_ERROR );
Finished:
if ( pszBuf != pBuf )
{
delete pszBuf;
}
if ( GetLastError() != NO_ERROR )
{
OutputDebugString( "Error!\n" );
dwRet = SF_STATUS_REQ_ERROR;
}
return dwRet;
}
BOOL
WINAPI
TerminateFilter(
DWORD dwFlags
)
/*++
Optional filter entry point. This function is called by the server
before this DLL is unloaded.
Arguments:
dwFlags - No flags have been defined at this time
Returns:
Always returns TRUE;
--*/
{
if ( g_hLogFile != INVALID_HANDLE_VALUE )
{
CloseHandle( g_hLogFile );
g_hLogFile = INVALID_HANDLE_VALUE;
}
DeleteCriticalSection( &g_LogFileLock );
return TRUE;
}