Skip to content

Commit

Permalink
Merge pull request #1528 from ampli/connector-hash-num
Browse files Browse the repository at this point in the history
Improve the connector hashing and extend condesc_t
  • Loading branch information
linas authored May 20, 2024
2 parents 213e9e7 + 244e053 commit 52bf569
Show file tree
Hide file tree
Showing 19 changed files with 169 additions and 115 deletions.
63 changes: 33 additions & 30 deletions link-grammar/connectors.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ static unsigned int get_connector_length_limit(condesc_t *cd,

int short_len = opts->short_length;
bool all_short = opts->all_short;
int length_limit = cd->length_limit;
int length_limit = cd->more->length_limit;

if ((all_short && (length_limit > short_len)) || (0 == length_limit))
return short_len;
Expand Down Expand Up @@ -165,7 +165,7 @@ static void set_condesc_length_limit(Dictionary dict, const Exp *e, int length_l
if (econlist[en]->uc_num != sdesc[cn]->uc_num) continue;
restart_cn = cn;

const char *wc_str = econlist[en]->string;
const char *wc_str = econlist[en]->more->string;
char *uc_wildcard = strchr(wc_str, LENGTH_LINIT_WILD_TYPE);

for (; cn < ct->num_con; cn++)
Expand All @@ -181,11 +181,11 @@ static void set_condesc_length_limit(Dictionary dict, const Exp *e, int length_l
else
{
/* The uppercase part is a prefix. */
if (0 != strncmp(wc_str, sdesc[cn]->string, uc_wildcard - wc_str))
if (0 != strncmp(wc_str, sdesc[cn]->more->string, uc_wildcard - wc_str))
break;
}

sdesc[cn]->length_limit = length_limit;
sdesc[cn]->more->length_limit = length_limit;
}
}
}
Expand Down Expand Up @@ -221,8 +221,8 @@ static void set_all_condesc_length_limit(Dictionary dict)

for (size_t en = 0; en < ct->num_con; en++)
{
if (0 == sdesc[en]->length_limit)
sdesc[en]->length_limit = UNLIMITED_LEN;
if (0 == sdesc[en]->more->length_limit)
sdesc[en]->more->length_limit = UNLIMITED_LEN;
}
}

Expand All @@ -234,7 +234,7 @@ static void set_all_condesc_length_limit(Dictionary dict)
for (size_t n = 0; n < ct->num_con; n++)
{
prt_error("%5zu %6u %3d %s\n\\", n, ct->sdesc[n]->uc_num,
ct->sdesc[n]->length_limit, ct->sdesc[n]->string);
ct->sdesc[n]->more->length_limit, ct->sdesc[n]->more->string);
}
prt_error("\n");
}
Expand Down Expand Up @@ -277,8 +277,8 @@ static void connector_encode_lc(const char *lc_string, condesc_t *desc)
lc_string, (int)(s-lc_string), MAX_CONNECTOR_LC_LENGTH);
}

desc->lc_mask = (lc_mask << 1) + !!(desc->flags & CD_HEAD_DEPENDENT);
desc->lc_letters = (lc_value << 1) + !!(desc->flags & CD_HEAD);
desc->lc_mask = (lc_mask << 1) + !!(desc->more->flags & CD_HEAD_DEPENDENT);
desc->lc_letters = (lc_value << 1) + !!(desc->more->flags & CD_HEAD);
}

/**
Expand All @@ -288,27 +288,27 @@ static void connector_encode_lc(const char *lc_string, condesc_t *desc)
*
* Note: check_connector() has already validated the connector string.
*/
void calculate_connector_info(condesc_t * c)
void calculate_connector_info(hdesc_t *hdesc)
{
const char *s;

s = c->string;
s = hdesc->string;
if (islower((unsigned char)*s))
{
dassert((c->string[0] == 'h') || (c->string[0] == 'd'),
"\'%c\': Bad head/dependent character", c->string[0]);
dassert((hdesc->string[0] == 'h') || (hdesc->string[0] == 'd'),
"\'%hdesc\': Bad head/dependent character", hdesc->string[0]);

if ((*s == 'h') || (*s == 'd')) c->flags |= CD_HEAD_DEPENDENT;
if (*s == 'h') c->flags |= CD_HEAD;
if ((*s == 'h') || (*s == 'd')) hdesc->flags |= CD_HEAD_DEPENDENT;
if (*s == 'h') hdesc->flags |= CD_HEAD;
s++; /* Ignore head-dependent indicator. */
}

c->uc_start = (uint8_t)(s - c->string);
hdesc->uc_start = (uint8_t)(s - hdesc->string);
/* Skip the uppercase part. */
do { s++; } while (is_connector_name_char(*s));
c->uc_length = (uint8_t)(s - c->string - c->uc_start);
hdesc->uc_length = (uint8_t)(s - hdesc->string - hdesc->uc_start);

connector_encode_lc(s, c);
connector_encode_lc(s, hdesc->desc);
}

/* ================= Connector descriptor table. ====================== */
Expand Down Expand Up @@ -373,11 +373,11 @@ int condesc_by_uc_constring(const void * a, const void * b)
if (NULL == *cda) return (NULL != *cdb);
if (NULL == *cdb) return -1;

const char *sa = &(*cda)->string[(*cda)->uc_start];
const char *sb = &(*cdb)->string[(*cdb)->uc_start];
const char *sa = &(*cda)->more->string[(*cda)->more->uc_start];
const char *sb = &(*cdb)->more->string[(*cdb)->more->uc_start];

int la = (*cda)->uc_length;
int lb = (*cdb)->uc_length;
int la = (*cda)->more->uc_length;
int lb = (*cdb)->more->uc_length;

if (la == lb)
{
Expand Down Expand Up @@ -424,7 +424,7 @@ static bool sort_condesc_by_uc_constring(Dictionary dict)
condesc_t *condesc = dict->contable.hdesc[n].desc;

if (NULL == condesc) continue;
calculate_connector_info(condesc);
calculate_connector_info(&dict->contable.hdesc[n]);
sdesc[i++] = dict->contable.hdesc[n].desc;
}

Expand All @@ -439,17 +439,17 @@ static bool sort_condesc_by_uc_constring(Dictionary dict)
{
condesc_t **condesc = &sdesc[n];

if (condesc[0]->uc_length != condesc[-1]->uc_length)
if (condesc[0]->more->uc_length != condesc[-1]->more->uc_length)

{
/* We know that the UC part has been changed. */
uc_num++;
}
else
{
const char *uc1 = &condesc[0]->string[condesc[0]->uc_start];
const char *uc2 = &condesc[-1]->string[condesc[-1]->uc_start];
if (0 != strncmp(uc1, uc2, condesc[0]->uc_length))
const char *uc1 = &condesc[0]->more->string[condesc[0]->more->uc_start];
const char *uc2 = &condesc[-1]->more->string[condesc[-1]->more->uc_start];
if (0 != strncmp(uc1, uc2, condesc[0]->more->uc_length))
{
uc_num++;
}
Expand Down Expand Up @@ -495,7 +495,7 @@ static hdesc_t *condesc_find(ConTable *ct, const char *constring, uint32_t hash)
uint32_t i = hash & (ct->size-1);

while ((NULL != ct->hdesc[i].desc) &&
!string_set_cmp(constring, ct->hdesc[i].desc->string))
!string_set_cmp(constring, ct->hdesc[i].string))
{
i = (i + 1) & (ct->size-1);
}
Expand Down Expand Up @@ -524,7 +524,7 @@ static bool condesc_grow(ConTable *ct)
{
hdesc_t *old_h = &old_hdesc[i];
if (NULL == old_h->desc) continue;
hdesc_t *new_h = condesc_find(ct, old_h->desc->string, old_h->str_hash);
hdesc_t *new_h = condesc_find(ct, old_h->string, old_h->str_hash);

if (NULL != new_h->desc)
{
Expand All @@ -533,6 +533,7 @@ static bool condesc_grow(ConTable *ct)
return false;
}
*new_h = *old_h;
new_h->desc->more = new_h;
}

free(old_hdesc);
Expand All @@ -548,9 +549,11 @@ condesc_t *condesc_add(ConTable *ct, const char *constring)
{
lgdebug(+11, "Creating connector '%s' (%zu)\n", constring, ct->num_con);
h->desc = pool_alloc(ct->mempool);
h->desc->string = constring;
h->string = constring;
h->desc->uc_num = UINT32_MAX;
h->str_hash = hash;
h->desc->more = h;
h->desc->con_num = ct->num_con;
ct->num_con++;

if ((8 * ct->num_con) > (3 * ct->size))
Expand Down
94 changes: 52 additions & 42 deletions link-grammar/connectors.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,9 @@
*/
#define MAX_SENTENCE 254 /* Maximum number of words in a sentence */

/* Length-limits for how far connectors can reach out. */
#define UNLIMITED_LEN 255

/* Since tracon IDs are unique per sentence, for convenience NULL
* connectors (zero-length tracons) have tracon IDs equal to the word
* number on which their disjunct resides. To that end an initial block
Expand Down Expand Up @@ -76,16 +79,16 @@ static inline bool is_connector_subscript_char(unsigned char c)
}
/* End of connector string character validation. */

/* Note: If more byte-size fields are needed, to save space
* uc_length and uc_start may be moved to struct hdesc. */
struct condesc_struct
{
lc_enc_t lc_letters;
lc_enc_t lc_mask;
typedef struct condesc_struct condesc_t;

/* The connector descriptors (see below) are pointed from a hash table
* with these elements. */
typedef struct hdesc
{
condesc_t *desc;
const char *string; /* The connector name w/o the direction mark, e.g. AB */
// float *cost; /* Array of cost by connector length (cost[0]: default) */
connector_uc_hash_t uc_num; /* uc part enumeration. */
// float *cost; // Array of cost by connector length (cost[0]: default)
connector_uc_hash_t str_hash;
uint8_t length_limit; /* If not 0, it gives the limit of the length of the
* link that can be used on this connector type. The
* value UNLIMITED_LEN specifies no limit.
Expand All @@ -97,11 +100,24 @@ struct condesc_struct
/* For connector match speedup when sorting the connector table. */
uint8_t uc_length; /* uc part length */
uint8_t uc_start; /* uc start position */
};
typedef struct condesc_struct condesc_t;
} hdesc_t;

/* Length-limits for how far connectors can reach out. */
#define UNLIMITED_LEN 255
/* Each connector type has a connector descriptor. The size of this
* struct is 32 byes, to facilitate CPU memory caching during parsing.
* The "more" field points to connector information that is needed in
* other steps. The con_num field is used a lot in steps that need
* connector hashing, and it is included here to avoid extra
* redirections.
* Multi connectors are considering the same type as their non-multi
* version, so the multi indication is kept in Connector_struct. */
struct condesc_struct
{
lc_enc_t lc_letters;
lc_enc_t lc_mask;
hdesc_t *more; /* More information, for keeping small struct size. */
connector_uc_hash_t uc_num; /* uc part enumeration. */
uint32_t con_num; /* Connector ordinal number. */
};

typedef struct length_limit_def
{
Expand All @@ -111,12 +127,6 @@ typedef struct length_limit_def
int length_limit;
} length_limit_def_t;

typedef struct hdesc
{
condesc_t *desc;
connector_uc_hash_t str_hash;
} hdesc_t;

typedef struct
{
hdesc_t *hdesc; /* Hashed connector descriptors table */
Expand Down Expand Up @@ -176,12 +186,17 @@ void condesc_reuse(Dictionary);
* accesses connectors */
static inline const char * connector_string(const Connector *c)
{
return c->desc->string;
return c->desc->more->string;
}

static inline unsigned int connector_uc_start(const Connector *c)
{
return c->desc->uc_start;
return c->desc->more->uc_start;
}

static inline unsigned int connector_uc_length(const Connector *c)
{
return c->desc->more->uc_length;
}

static inline const condesc_t *connector_desc(const Connector *c)
Expand All @@ -199,12 +214,16 @@ static inline unsigned int connector_uc_num(const Connector * c)
return c->desc->uc_num;
}

static inline unsigned int connector_num(const Connector * c)
{
return 2 * c->desc->con_num + c->multi;
}

/* Connector utilities ... */
Connector * connector_new(Pool_desc *, const condesc_t *);
void set_connector_farthest_word(Exp *, int, int, Parse_Options);
void free_connectors(Connector *);
void calculate_connector_info(condesc_t *);
void calculate_connector_info(hdesc_t *);
int condesc_by_uc_constring(const void *, const void *);

/**
Expand Down Expand Up @@ -310,41 +329,32 @@ static inline uint32_t string_hash(const char *s)
}

typedef uint32_t connector_hash_t;
static const connector_hash_t FIBONACCI_MULT = 0x9E3779B9;

static inline connector_hash_t connector_hash(const Connector *c)
{
// The use of (c->desc->lc_mask & 1) during hashing is important;
// See pull req #1487 for details. This raises other questions
// about hashing. Two forms are attempted below. They appear to
// be equivalent, in terms of measured elapsed-time performance.
// (I did not look at the quality of the distribution.)
// The second form uses some mixing bitshifts:
// 266281 == sum of 1 8 32 4096 (256*1024) It is a prime number
// 524429 == sum of 1 4 8 128 (512*1024) and it is a prime number
#ifdef SIMPLE_HASH
return c->desc->uc_num +
(c->multi << 19) +
(((connector_hash_t)c->desc->lc_mask & 1) << 20) +
(connector_hash_t)c->desc->lc_letters;
#else
return c->desc->uc_num +
c->multi * 266281 +
(((connector_hash_t)c->desc->lc_mask & 1) * 524429) +
((connector_hash_t)c->desc->lc_letters) * 101;
#endif
// connector_num() is different for each connector string + its multi
// attribute, and it naturally depends also on its head-dependent
// attribute, if any. For the importance of considering the
// head-dependent attribute during hashing see pull req #1487;
return connector_num(c);
}

/**
* \p c is assumed to be non-NULL.
*/
// To check hash functions, enable the "N" printing in
// eliminate_duplicate_disjuncts().
#define FEEDBACK_HASH 1
static inline connector_hash_t connector_list_hash(const Connector *c)
{
connector_hash_t accum = connector_hash(c);

for (c = c->next; c != NULL; c = c->next)
#ifdef FEEDBACK_HASH
accum = (19 * accum) + (accum >> 24) + connector_hash(c);
#if FEEDBACK_HASH
accum = (accum<<7) + (accum<<14) + (accum >> 16) - connector_hash(c);
#else
// Bad.
accum = (19 * accum) + connector_hash(c);
#endif

Expand Down
12 changes: 6 additions & 6 deletions link-grammar/dict-atomese/lookup-atomese.cc
Original file line number Diff line number Diff line change
Expand Up @@ -422,8 +422,8 @@ static void update_condesc(Dictionary dict)
if (NULL == condesc) continue;
if (UINT32_MAX != condesc->uc_num) continue;

calculate_connector_info(condesc);
condesc->length_limit = UNLIMITED_LEN;
calculate_connector_info(&ct->hdesc[n]);
condesc->more->length_limit = UNLIMITED_LEN;
sdesc[i++] = condesc;
}

Expand All @@ -438,16 +438,16 @@ static void update_condesc(Dictionary dict)
{
condesc_t **condesc = &sdesc[n];

if (condesc[0]->uc_length != condesc[-1]->uc_length)
if (condesc[0]->more->uc_length != condesc[-1]->more->uc_length)
{
/* We know that the UC part has been changed. */
uc_num++;
}
else
{
const char *uc1 = &condesc[0]->string[condesc[0]->uc_start];
const char *uc2 = &condesc[-1]->string[condesc[-1]->uc_start];
if (0 != strncmp(uc1, uc2, condesc[0]->uc_length))
const char *uc1 = &condesc[0]->more->string[condesc[0]->more->uc_start];
const char *uc2 = &condesc[-1]->more->string[condesc[-1]->more->uc_start];
if (0 != strncmp(uc1, uc2, condesc[0]->more->uc_length))
{
uc_num++;
}
Expand Down
2 changes: 1 addition & 1 deletion link-grammar/dict-atomese/word-pairs.cc
Original file line number Diff line number Diff line change
Expand Up @@ -385,7 +385,7 @@ static Exp* get_sent_pair_exprs(Dictionary dict, const Handle& germ,
{
assert(CONNECTOR_type == orch->type, "unexpected expression!");

if (links.end() != links.find(orch->condesc->string))
if (links.end() != links.find(orch->condesc->more->string))
{
Exp* cpe = (Exp*) pool_alloc(pool);
*cpe = *orch;
Expand Down
Loading

0 comments on commit 52bf569

Please sign in to comment.