/usr/web/sources/contrib/steve/root/sys/src/cmd/seft/main.c

Plan 9 from Bell Labs’s /usr/web/sources/contrib/steve/root/sys/src/cmd/seft/main.c

/*******************************************************************************
*
* Project:      seft (search engine for text)
* 
* File:         main.c
*
* Author:       Owen de Kretser (oldk@cs.mu.oz.au)
* 
* Organisation: Dept. of CS&SE, University of Melbourne
*
* Date:         April 1999
*
* Purpose:
* 
*******************************************************************************/

/***** #includes **************************************************************/

#include <stdio.h>
#include <string.h>
#include <bsd.h>
#include <stdlib.h>
#include <math.h>

#include <sys/types.h>
#include <ctype.h>
#include <limits.h>
#include <float.h>

#include "main.h"
#include "util.h"
#include "words.h"
#include "local_strings.h"
#include "heap.h"
#include "output.h"
#include "stem.h"
#include "tst.h"

/***** #defines and Macros ****************************************************/

#define K_APPROX 63              /* used to calculate approx n (unique words) */
#define B_APPROX 0.47

/***** Function Prototypes ****************************************************/

void  process_args(int argc, char* argv[], query_data *qd);
void  init_qd(query_data* qd);
void  process_query_line(query_data* qd, char* line);
void  process_query_file(query_data* qd);
void  process_text_files(query_data* qd, FILE* fptr, char** filenameptr);
void  process_text(query_data* qd, long seek_pos, char** filenameptr, char* );
void  calc_weights(query_data* qd);
void  calc_accumulators(float (*shape)(long, int),query_data* qd);
void  selection(query_data* qd);
float decay_circle(long, int);
void  usage(char* progname);

/*******************************************************************************
*
* Function: main
*
*******************************************************************************/

int
main(int argc, char* argv[])
{
    query_data qd;
    int i;
    FILE* fptr;
    
    process_args(argc,argv,&qd);

    init_qd(&qd);
    
    if (optind > (argc-1))
        usage(argv[0]);
        
    if (qd.file_name != NULL)
        {
        process_query_file(&qd);                    // query from file
        }
    else
        {
        qd.query_string = safe_strdup(argv[optind]);
        process_query_line(&qd,qd.query_string);    // query from command line
        optind++;
        }

    for (i=optind; i < argc; i++)
        {
        fptr = safe_fopen(argv[i],"r");

        process_text_files(&qd,fptr,&(argv[i]));

        fclose(fptr);
        }

    qd.N = qd.curr_pos - 1;

    calc_weights(&qd);

    calc_accumulators(decay_circle,&qd);


    selection(&qd);

    return (0);
}


/*******************************************************************************
*
* Function: init_qd
*
* Purpose:  initialise the query data structure
*
*******************************************************************************/

void
init_qd(query_data* qd)
{
    int i;
    int k = ASCII7 * ASCII7;

    qd->seek_buffer = 
        (long*)safe_malloc(qd->window_size*sizeof(long));

    for (i=0; i < qd->window_size; i++)
        {
        qd->seek_buffer[i] = 0;
        }

    qd->word_array = (WORD*)safe_malloc(INIT_WORD_SIZE*sizeof(WORD));
    qd->query_table = (TERM*)safe_malloc(INIT_TERM_SIZE*sizeof(TERM));
    qd->doc_idx =    (long*)safe_malloc(MAX_DOCS*sizeof(long));
    qd->max_words = INIT_WORD_SIZE;
    qd->max_terms = INIT_TERM_SIZE;
    qd->no_words = 0;
    qd->line_no = 0;    
    qd->curr_pos = 1;
    qd->no_docs = 0;
    qd->no_terms = 0;
    qd->tst_root = NULL;


    qd->min_length = MAX_QUERY_LEN;

    // sentinal for heap selection
    qd->word_array[0].pos = 0;
    qd->word_array[0].termptr = NULL;
    qd->word_array[0].level = FLT_MAX;


    qd->lookup = (int*)safe_malloc(k*sizeof(int));
    for (i=0; i < k; i++)
        qd->lookup[i] = 256;

}

/*******************************************************************************
*
* Function: process_args
*
* Purpose:  process the command line arguments
*
*******************************************************************************/

void 
process_args(int argc, char* argv[], query_data *qd)
{
    int ch;
    
    // set argument defaults
    
    qd->file_name    = NULL;
    qd->query_string = NULL;
    qd->window_size = DEFAULT_WINDOW_SIZE;
    qd->doc_separator = NULL;
    qd->max_windows = DEFAULT_MAX_WINDOWS;
    qd->no_output = FALSE;
    qd->no_hilite = FALSE;
    qd->stem_method = 3;
    qd->do_formfeed = FALSE;

    // process arguments
    
    while ((ch = getopt (argc, argv, "phs:xnm:f:w:d:")) != -1)
        switch (ch) 
        {
        case 'f' :                        /* input file */
            qd->file_name = strdup(optarg);
            break;
        case 'd' :
            qd->doc_separator = strdup(optarg);
            break;
        case 'w' :
            qd->window_size = atoi(optarg);
            break;
        case 'm' :
            qd->max_windows = atoi(optarg);
            break;
        case 'p' :
            qd->do_formfeed = TRUE;
            break;
        case 'x' :
            qd->no_hilite = TRUE;
            break;
        case 's' :
            qd->stem_method = atoi(optarg);
            if (qd->stem_method < 0 || qd->stem_method > 2)
                usage(argv[0]); 
            if (qd->stem_method == 2)
                qd->stem_method = 3;
            break;
        case 'n' :
            qd->no_output = TRUE;
            break;
        case 'h' :
        case '?' :
        default:
            usage(argv[0]);
      }

    if (qd->doc_separator == NULL)
            qd->doc_separator = strdup("");

}

/*******************************************************************************
*
* Function: process_query_line
*
* Purpose:  parse a query line of text and insert words into hash table
*
*******************************************************************************/

void 
process_query_line(query_data* qd, char* line)
{
    u_char *end, *s_in;
    u_char Word[MAXSTEMLEN];
    int index;
    int calc;

    if (line == NULL)
        return;

    s_in = (u_char*)line;
    end = s_in + strlen((char*)s_in) - 1;

    if (!INAWORD(*s_in))
        PARSE_NON_WORD_DISCARD(s_in, end);

    while ( s_in <= end) 
        {
        /* Get a word and stem it */
        PARSE_WORD(Word, s_in, end);

        stemmer(qd->stem_method,Word);

        if (Word[0] < qd->min_length)
            {
            qd->min_length = Word[0];
            }



        if (qd->stem_method == 0)
            {
            calc = ((Word[1] & 127u) << 7) | (Word[2] & 127u);
            if (qd->lookup[calc] > Word[0])
                {
                qd->lookup[calc] = Word[0];
                }
            }
        else
            {
            calc = ((toupper(Word[1]) & 127u) << 7) | (toupper(Word[2]) & 127u);
            if (qd->lookup[calc] > Word[0])
                {
                qd->lookup[calc] = Word[0];
                calc = ((tolower(Word[1]) & 127u) << 7) | (toupper(Word[2]) & 127u);
                qd->lookup[calc] = Word[0];
                calc = ((tolower(Word[1]) & 127u) << 7) | (tolower(Word[2]) & 127u);
                qd->lookup[calc] = Word[0];
                calc = ((toupper(Word[1]) & 127u) << 7) | (tolower(Word[2]) & 127u);
                qd->lookup[calc] = Word[0];
                }
            }

        Word[Word[0]+1] = '\0';

        index = tst_search(qd->tst_root, Word);

        if (index == NOT_FOUND)
            {
            if (qd->no_terms == qd->max_terms)
                {
                qd->query_table = (TERM*)safe_realloc(qd->query_table,
                        (qd->max_terms + INCR_TERM_SIZE)*sizeof(TERM));
                qd->max_terms += INCR_TERM_SIZE;
                }

            qd->query_table[qd->no_terms].Word = copy_string(Word);
            qd->query_table[qd->no_terms].fqt = 1;
            qd->query_table[qd->no_terms].ft = 0;
            qd->query_table[qd->no_terms].spread = 0;
            qd->query_table[qd->no_terms].init_height = 0;
            qd->query_table[qd->no_terms].term_num = qd->no_terms+1;

            qd->tst_root = tst_insert(qd->tst_root, Word, qd->no_terms);
                                        
            qd->no_terms++;
            }
         else
            {
            qd->query_table[index].fqt++;
            }

        PARSE_NON_WORD_DISCARD(s_in, end);
        }
}

/*******************************************************************************
*
* Function: process_query_file
*
* Purpose:  parse a file containing query terms
*
*******************************************************************************/

void 
process_query_file(query_data* qd)
{
    u_char line[MAX_QUERY_LEN];

    if (qd->file_name == NULL)

        return;

    qd->query_file = safe_fopen(qd->file_name,"r");

    while (fgets(line,MAX_QUERY_LEN,qd->query_file) != NULL)
    {
        process_query_line(qd, line);
    }
    
    fclose(qd->query_file);
}

/*******************************************************************************
*
* Function: calc_weights
*
* Purpose:  calculate height and spread factors (using Heap' Law)
*
*******************************************************************************/

void 
calc_weights(query_data* qd)
{
    int i;

    for (i=0; i < qd->no_terms; i++)
        {
        if (qd->query_table[i].ft != 0)
            {
        qd->query_table[i].init_height = log(((float)qd->N/(float)
                qd->query_table[i].ft)) * qd->query_table[i].fqt;
        qd->query_table[i].spread = ((K_APPROX*pow(qd->N,B_APPROX)) /
                                    (float)qd->query_table[i].ft); 
            }
        }

    for (i=1; i < qd->no_words+1; i++)
            qd->word_array[i].level = qd->word_array[i].termptr->init_height;
}


/*******************************************************************************
*
* Function: calc_accumulators
*
* Purpose:  calculate contribution curves and update location accumulators
*
*******************************************************************************/

void
calc_accumulators(float (*shape)(long,int),query_data* qd)
{

    int i,l;
    
    int doci = 0;
    long dist;
    int spread;
    float contrib;
    long lower;
    
    for (i=1; i <= qd->no_words; i++)
        {
        l = i+1;
        
        spread = qd->word_array[i].termptr->spread;
        
        while (qd->word_array[i].pos > qd->doc_idx[doci])
            {
            doci++;
            }
        while ((l <= qd->no_words) && (qd->word_array[l].pos <= qd->doc_idx[doci]))
            {
            dist = qd->word_array[l].pos - qd->word_array[i].pos;

            if (dist <= spread)
                {
                if (qd->word_array[i].termptr->term_num !=
                    qd->word_array[l].termptr->term_num)
                    {
                    contrib = qd->word_array[i].termptr->init_height *
                                            shape(dist,spread);
                    qd->word_array[l].level += contrib; 
                    }
                }
            else
                {
                break;
                }
            l++;
            }

        l = i-1;

        if (doci == 0)
            lower = 0;
        else 
            lower = qd->doc_idx[doci-1];


        while ((l > 0) && (qd->word_array[l].pos > lower))
            {
            dist =  qd->word_array[i].pos - qd->word_array[l].pos;

            if (dist <= spread)
                {
                if (qd->word_array[i].termptr->term_num !=
                                qd->word_array[l].termptr->term_num)
                    {
                    contrib = qd->word_array[i].termptr->init_height *
                                            shape(dist,spread);

                    qd->word_array[l].level += contrib;   
                    }
                }
            else
                {
                break;
                }
            l--;
            }
        }
}

/*******************************************************************************
*
* Function: decay_circle
*
* Purpose:  a particular contribution curve shape 
*
*******************************************************************************/

float
decay_circle(long dist, int D)
{
    float y;

    float point = (float)dist/(float)D;

    y = 1 - (sqrt((2*point) - (point*point)));

    return (y);
}

/*******************************************************************************
*
* Function: selection
*
* Purpose:  select top (max_windows) term locations (windows) in ranked order
*
*******************************************************************************/

void
selection(query_data* qd)
{
    WORD** ptr_array;
    int i,j;
    int heap_size = qd->no_words;
    WORD* word;
    float max_score = 0;
    int score;
    WORD** merge_search;
    bool output_flag;
    int line_min, line_max;
    
    ptr_array =  (WORD**)safe_malloc((qd->no_words+1)*sizeof(WORD*));
    merge_search = (WORD**)safe_malloc(qd->max_windows*sizeof(WORD*));

    for(i=0; i <= qd->no_words; i++)
        ptr_array[i] = &(qd->word_array[i]);


    for (i=heap_size/2; i >=1; i--)
        ptr_array = heap_down(ptr_array,i,heap_size);

    i = 1;
    while (i <= qd->max_windows)
        {
        if (heap_size == 0)
            {
            break;
            }
        word = heap_remove(ptr_array,&heap_size);
        merge_search[i-1] = word;

        if (i == 1)
            {
            max_score = word->level;
            }

        // check for window merging 
        output_flag = TRUE;

        for (j=0; j < (i-1); j++)
            {
            // check filenames
            if (strcmp(*(word->name),*(merge_search[j]->name)) == 0)
                {
                // check window boundaries
                line_min = merge_search[j]->line_no - (qd->window_size/2);
                line_max = merge_search[j]->line_no + (qd->window_size/2);
                if ((qd->window_size % 2) == 0)
                    line_max--;
                if ((word->line_no >= line_min) && (word->line_no <= line_max))
                    {
                    output_flag = FALSE;
                    break;
                    }
                }
            }

        if (output_flag == TRUE)
            {
                score = (int)((word->level*100)/max_score);
                output_window(qd, word, i, score);
                if ((qd->no_output == FALSE) && (qd->do_formfeed == TRUE))
                    fprintf(stdout,"\f\n");
                i++;
                }
            }
    }

/*******************************************************************************
*
* Function: process_text_files
*
* Purpose:  extract line of input text, buffer and note seek position
*
*******************************************************************************/

void
process_text_files(query_data* qd, FILE* fptr, char** name)
{
    int half = qd->window_size/2;
    char line[MAX_QUERY_LEN];

    if (qd->window_size%2 == 0)
        half--;

    qd->line_no = 0;

    while (fgets(line,MAX_QUERY_LEN,fptr) != NULL)
        {
        qd->seek_buffer[qd->line_no%qd->window_size] = ftell(fptr);
        process_text(qd,
                     qd->seek_buffer[(qd->line_no+half)%qd->window_size],
                     name,line);
        qd->line_no++;
        }

    qd->no_docs++;
    qd->doc_idx[qd->no_docs-1] = qd->curr_pos-1;
}

/*******************************************************************************
*
* Function: process_text
*
* Purpose:  parse lines of text, determine whether word is in hash table
*
*******************************************************************************/

void 
process_text(query_data* qd, long seek_pos, char** name, char* line)
{
    u_char *end, *s_in;
    u_char Word[MAXSTEMLEN];
    int flag = 0;
    int index;
    int calc; 


    s_in = (u_char*)line;
    end = s_in + strlen((char*)s_in) - 1;

    if (!INAWORD(*s_in))
        {
        PARSE_NON_WORD_FLAG(flag,s_in, end);

        if (flag == 1)
            {
            qd->no_docs++;
            qd->doc_idx[qd->no_docs-1] = qd->curr_pos-1;
            flag = 0;
            }
        }

    while ( s_in <= end) 
        {
        /* Get a word */
        PARSE_WORD(Word, s_in, end);


        calc = ((Word[1] & 127u) << 7) | (Word[2] & 127u);
        if (Word[0] >= qd->lookup[calc]) 
                {
                stemmer(qd->stem_method,Word);
                Word[Word[0]+1] = '\0';
                index = tst_search(qd->tst_root,Word);

                if (index != -1)
                    {
                    if (qd->no_words == (qd->max_words-1))
                        {
                        qd->word_array = (WORD*)safe_realloc(qd->word_array,
                        (qd->max_words + INCR_WORD_SIZE)*sizeof(WORD));
                        qd->max_words += INCR_WORD_SIZE;
                        }

                    qd->query_table[index].ft++;
                    qd->no_words++;

                    qd->word_array[qd->no_words].no = qd->no_words;
                    qd->word_array[qd->no_words].termptr = &( qd->query_table[index]);
                    qd->word_array[qd->no_words].pos = qd->curr_pos;
                    qd->word_array[qd->no_words].name = name;
                    qd->word_array[qd->no_words].line_no = qd->line_no+1;
                    qd->word_array[qd->no_words].seek_pos = seek_pos;
                    }
                }
       qd->curr_pos++;

       // parse non-word, check for doc separator
            
       PARSE_NON_WORD_FLAG(flag,s_in, end);
       if (flag == 1)
           {
           qd->no_docs++;
           qd->doc_idx[qd->no_docs-1] = qd->curr_pos-1;
           flag = 0;
           }
       }
}

/*******************************************************************************
*
* Function: usage
*
* Purpose:  display usage message and exit
*
*******************************************************************************/

void  usage(char* progname)
{
        fprintf(stderr, "usage: %s "
                        "[-nxh] "
                        "[-s 0|1|2] "
                        "[-f query_file] "
                        "[-w window_size] "
                        "[-m max_windows] "
                        "\"query terms\" text_files\n",
                        progname);
        exit(1);
}

/******************************************************************************/
(Return to Plan 9 Home Page)