iframe-proxy | Sunbelt Computer Software

History

10319 lines (9510 loc) · 456 KB

Raw

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

130

131

132

133

134

135

136

137

138

139

140

141

142

143

144

145

146

147

148

149

150

151

152

153

154

155

156

157

158

159

160

161

162

163

164

165

166

167

168

169

170

171

172

173

174

175

176

177

178

179

180

181

182

183

184

185

186

187

188

189

190

191

192

193

194

195

196

197

198

199

200

201

202

203

204

205

206

207

208

209

210

211

212

213

214

215

216

217

218

219

220

221

222

223

224

225

226

227

228

229

230

231

232

233

234

235

236

237

238

239

240

241

242

243

244

245

246

247

248

249

250

251

252

253

254

255

256

257

258

259

260

261

262

263

264

265

266

267

268

269

270

271

272

273

274

275

276

277

278

279

280

281

282

283

284

285

286

287

288

289

290

291

292

293

294

295

296

297

298

299

300

301

302

303

304

305

306

307

308

309

310

311

312

313

314

315

316

317

318

319

320

321

322

323

324

325

326

327

328

329

330

331

332

333

334

335

336

337

338

339

340

341

342

343

344

345

346

347

348

349

350

351

352

353

354

355

356

357

358

359

360

361

362

363

364

365

366

367

368

369

370

371

372

373

374

375

376

377

378

379

380

381

382

383

384

385

386

387

388

389

390

391

392

393

394

395

396

397

398

399

400

401

402

403

404

405

406

407

408

409

410

411

412

413

414

415

416

417

418

419

420

421

422

423

424

425

426

427

428

429

430

431

432

433

434

435

436

437

438

439

440

441

442

443

444

445

446

447

448

449

450

451

452

453

454

455

456

457

458

459

460

461

462

463

464

465

466

467

468

469

470

471

472

473

474

475

476

477

478

479

480

481

482

483

484

485

486

487

488

489

490

491

492

493

494

495

496

497

498

499

500

501

502

503

504

505

506

507

508

509

510

511

512

513

514

515

516

517

518

519

520

521

522

523

524

525

526

527

528

529

530

531

532

533

534

535

536

537

538

539

540

541

542

543

544

545

546

547

548

549

550

551

552

553

554

555

556

557

558

559

560

561

562

563

564

565

566

567

568

569

570

571

572

573

574

575

576

577

578

579

580

581

582

583

584

585

586

587

588

589

590

591

592

593

594

595

596

597

598

599

600

601

602

603

604

605

606

607

608

609

610

611

612

613

614

615

616

617

618

619

620

621

622

623

624

625

626

627

628

629

630

631

632

633

634

635

636

637

638

639

640

641

642

643

644

645

646

647

648

649

650

651

652

653

654

655

656

657

658

659

660

661

662

663

664

665

666

667

668

669

670

671

672

673

674

675

676

677

678

679

680

681

682

683

684

685

686

687

688

689

690

691

692

693

694

695

696

697

698

699

700

701

702

703

704

705

706

707

708

709

710

711

712

713

714

715

716

717

718

719

720

721

722

723

724

725

726

727

728

729

730

731

732

733

734

735

736

737

738

739

740

741

742

743

744

745

746

747

748

749

750

751

752

753

754

755

756

757

758

759

760

761

762

763

764

765

766

767

768

769

770

771

772

773

774

775

776

777

778

779

780

781

782

783

784

785

786

787

788

789

790

791

792

793

794

795

796

797

798

799

800

801

802

803

804

805

806

807

808

809

810

811

812

813

814

815

816

817

818

819

820

821

822

823

824

825

826

827

828

829

830

831

832

833

834

835

836

837

838

839

840

841

842

843

844

845

846

847

848

849

850

851

852

853

854

855

856

857

858

859

860

861

862

863

864

865

866

867

868

869

870

871

872

873

874

875

876

877

878

879

880

881

882

883

884

885

886

887

888

889

890

891

892

893

894

895

896

897

898

899

900

901

902

903

904

905

906

907

908

909

910

911

912

913

914

915

916

917

918

919

920

921

922

923

924

925

926

927

928

929

930

931

932

933

934

935

936

937

938

939

940

941

942

943

944

945

946

947

948

949

950

951

952

953

954

955

956

957

958

959

960

961

962

963

964

965

966

967

968

969

970

971

972

973

974

975

976

977

978

979

980

981

982

983

984

985

986

987

988

989

990

991

992

993

994

995

996

997

998

999

1000

#include "common.h"

// File-scope state and forward declarations for the tagger/parser.

extern unsigned int tagRuleCount; // defined in another translation unit

unsigned int ambiguousWords;

bool reverseWords = false;

// Per-language TreeTagger enable bits (only maintained when multidict is set):
// LoadTreetagger sets (1 << (languageIndex * 2)) once the tagger parameter file loads and
// (2 << (languageIndex * 2)) once the chunker parameter file loads; TreeTagger() tests both.
static int treetagging = 0;

int ignoreRule = -1;

// BlendWithTreetagger stores here the word index at which its scan stopped.
static int ttLastChanged = 0;

static WORDP firstAux = NULL;

static bool ApplyRules();

static void Tags(char* buffer, int i);

static bool ProcessOmittedClause(unsigned int verb1,bool &changed) ;

static unsigned int quotationCounter;

static unsigned char clauseBit;

static unsigned char prepBit;

static unsigned char verbalBit;

static unsigned int quotationRoles[20];

static unsigned int quotationRoleIndex;

// The following arrays are parallel stacks, one slot per nesting level (up to MAX_CLAUSES).

static unsigned char startStack[MAX_CLAUSES]; // where we began this level

static unsigned char auxVerbStack[MAX_CLAUSES]; // most recent aux for this level

static unsigned char objectStack[MAX_CLAUSES]; // most recent object for this level

unsigned char subjectStack[MAX_CLAUSES]; // the subject found for this level of sentence piece (tied to roleIndex)

unsigned char verbStack[MAX_CLAUSES]; // the verb found for this level of sentence piece (tied to roleIndex)

unsigned int needRoles[MAX_CLAUSES]; // what we seek in direct object land or verb land at current level of main/phrase/clause

unsigned int roleIndex;

static unsigned int currentMainVerb = 0;

static unsigned int currentVerb2 = 0;

static bool NounFollows( int i,bool thruComma);

char* usedTrace = NULL;

int usedWordIndex = 0;

uint64 usedType = 0;

static int firstnoun; // first noun we see in a sentence (maybe object of wrapped prep from end)

static int determineVerbal;

static int firstNounClause;

#define UNKNOWN_CONSTRAINT 2

#define NO_FIELD_INCREMENT 3

#ifdef INFORMATION

A rule consists of 4 64bit values, representing 4 comparator words (result uses one of them also), and a uint64 control word

The control word represents 6 bytes (describing how to interpret the 4 patterns and result), and a 1-byte offset locator to orient the pattern

The result is to either discard named bits or to restrict to named bits on the PRIMARY location, using the bits of the primary include...

A std rule has 4 tests it can consult, anchored around a starting word. A big rule uses a 2nd rule as a continuation, to see 4 more tests.

basic/0: 6-CONTROL_OP 3-CONTROL_FLAGS PART2_BIT ----------------3-RESULT_INDEX 48-PATTERN_BITS

value1: 6-CONTROL_OP 3-CONTROL_FLAGS KEEP_BIT ----------------3-OFFSET_SHIFT 48-PATTERN_BITS

value2: 6-CONTROL_OP 3-CONTROL_FLAGS REVERSE_BIT 1-unused 1-? 48-PATTERN_BITS

value3: 6-CONTROL_OP 3-CONTROL_FLAGS TRACE_BIT USED_BIT 2-? PART1_BIT 48-PATTERN_BITS

The pattern component identifies which standard word properties are to be checked for from D->properties.

Result (1st field) indicates which test has the word whose bits we want to modify.

Offset_shift (2nd field) indicates where first test word is relative to base word (+ or -)

Control flags are: SKIP, STAY, NOT.

The control_op specifies what test to perform on that field.

Rules are executed in forward order only, so later rules can presume earlier rules have already processed appropriately.

# should cardinal adjectives be under DETERMINER and not ADJECTIVE?

# the word HOME is wonderfully overloaded as noun,verb,adjective,adverb for testing

# TRACE on start of a rule result allows you to watch it

# NOGUESS # do no guessing

# INVERTWORDS # test sentence words in opposite order

# HAS = any bit matching (could be only bit it has)

# IS = bits from this collection match and no other bits are left over

# !IS will fail if result is ambiguous so it may or may not be.

# INCLUDE = has one or more of these bits AND has other bits -- DO NOT ! this, must be the ONLY field which also has * on it

# -- beware of using !IS (should use HAS) because it will match while still ambiguous

# A pattern should have only one "include", the bits you are deciding to keep or discard. Other places should use IS or HAS.

# SKIP takes a test and a value set. But it AUTOMATICALLY skips over every phrase or clause already marked

# START, END check location of this word relative to sentence start and end

# ISORIGINAL = is this word

# ISCANONICAL = is this root word

# ISMEMBER = is canonical word a direct member of this set

# HASPROPERTY = check for systemflag presence of PRONOUN_SINGULAR, OTHER_PLURAL. absence means nothing but presence is important so dont use ! (defined for determiner and pronoun)

# START = this is just before 1st word of sentence

# ISQUESTION aux or qword - sentence begins with possible aux verb or question word

# 0 = no bits

# ! inverts test

# STAY = dont move to next sentence word yet

# x means control/bits not used

# reverse means going backwards... Before the word still has offset -1 (means actually after the word)

# actions: DISCARD, KEEP, or DISABLE

# the current position should always be tested as INCLUDE, because it should have too many bits. and this must be that fields first test (to set result bits)

# tests with HAS mean it may or may not have been fully resolved yet, you are making a heuristic guess

# try to make rules self-standing, not merely a default happening after a prior rule fails to fire

# rules should be independent of each other and the order they are run in. Periodically test inverse order of rules by saying "INVERTRULES" before the first rule.

# top level parts of speech are:

# PREDETERMINER DETERMINER NOUN_BITS VERB_BITS AUX_VERB ADJECTIVE_BITS ADVERB PREPOSITION CONJUNCTION_COORDINATE CONJUNCTION_SUBORDINATE

# THERE_EXISTENTIAL TO_INFINITIVE PRONOUN_BITS POSSESSIVE_BITS COMMA PAREN PARTICLE NOUN_INFINITIVE

#endif

uint64* dataBuf;

char** commentsData;

static void DropLevel();

// zone data is transient per call to assignroles

#define ZONE_LIMIT 25 // capacity of the zoneBoundary and zoneData arrays

static unsigned char zoneBoundary[ZONE_LIMIT]; // where commas are

static unsigned int zoneData[ZONE_LIMIT]; // what can be found in a zone - noun before verb, verb, noun after verb

static unsigned char zoneMember[MAX_SENTENCE_LENGTH]; // one entry per sentence word position

static unsigned int zoneIndex;

static int predicateZone; // where is main verb found

static unsigned int currentZone;

static unsigned int ambiguous;

static bool ResolveByStatistic(int i,bool &changed);

// When currentVerb is omitted it defaults to verbStack[roleIndex]; C++ evaluates a default
// argument at each call, so it picks up the value of roleIndex current at the call site.
static void SetRole(int i, uint64 role, bool revise = false, int currentVerb = verbStack[roleIndex]);

#ifdef JUNK

Subject complements are after linking verbs. We label noun complements as direct objects and adjective complements as subject_complement.

Object complements follow a direct object and is noun-like or adjective-like- "The convention named him President" -- not appositive? verb takes object complement.???

"The clown got the children *excited"

Verb complement is direct or indirect object of verb.

Additionally some verbs expect object complements which are directly nouns, eg FACTITIVE_NOUN_VERB "we elected him *president" which has an omitted "as" - we elected him as president

Some verbs expect object complements to be adjectives. eg FACTITIVE_ADJECTIVE_VERB "we made him *happy"

Some verbs expect object complements to be infinitive eg VERB_TAKES_INDIRECT_THEN_VERBINFINITIVE "we want him to go"

if (parseFlags[i] & (FACTITIVE_ADJECTIVE_VERB|FACTITIVE_NOUN_VERB)) needRoles[roleIndex] |= OBJECT_COMPLEMENT; //

Two nouns in a row:

1. appositive (renaming a noun after - "my *dog *Bob") - can be on any noun

2. Adjectival noun ((char*)"*bank *teller") - can be on any noun

3. indirectobject/directobject ((char*)"I gave *Bob the *ball") - expected by verb

4. object object-complement ((char*)"the convention named *Bob *President" ) expected by verb

5. omitted clause starter?

To Infinitives verbals can be nouns, postnominal adjectives, or adverb. Cannot be appositive.

1. postnominal adjective: "her plan to subsidize him was sound"

2. object - "she wanted to raise taxes"

3. subject- "to watch is fun"

4. adverb - "he went to college to study math"

parenthetical infinitive: to sum up, I worked

adjective_noun

How do you tell postnominal adjective from adverb--

can postnominal adjective NOT follow object of a phrase?

But "it is time to go" or "that was a sight to see"

// ~factitive_adjective_Verbs take object complement adjective after direct object noun

// ~factitive_noun_verbs take object complement noun after direct object noun

// ~adjectivecomplement_taking_noun_infinitive adjectives can take a noun to infinitive after them as adjective "I was able to go"

Basic main sentence requirements on verb are:

mainverb

mainverb subjectcomplement (linking verbs like "be" take noun or adjective as subject complement though we label noun as direct object and adjective as subject complement)

mainverb directobject (directobject can be noun, to-infinitive, infinitive, as well as clause, depending on verb)

mainverb indirectobject directobject

mainverb directobject objectcomplement (objectcomplement can be noun, infinitive, adjective depending on verb)

#endif

// Printable names of the tagging-rule test operations.
// NOTE(review): presumably indexed by the rule CONTROL_OP value, so the order must stay in
// sync with the opcode numbering - confirm against the opcode definitions.
// Every literal carries an explicit (char*) cast because the table is not const-qualified.
static char* tagOps [] =
{
	(char*)"?",(char*)"HAS",(char*)"IS",(char*)"INCLUDE",(char*)"CANONLYBE",(char*)"HASORIGINAL",(char*)"PRIORPOS",(char*)"POSTPOS",(char*)"PASSIVEVERB", // 0-8
	(char*)"HASPROPERTY",(char*)"HASALLPROPERTIES",(char*)"HASCANONICALPROPERTY",(char*)"NOTPOSSIBLEVERBPARTICLEPAIR", // 9-12
	(char*)"PARSEMARK", // 13
	(char*)"ISORIGINAL",(char*)"ISCANONICAL",(char*)"PRIORCANONICAL",(char*)"ISMEMBER",(char*)"PRIORORIGINAL",(char*)"POSTORIGINAL", // 14-19
	(char*)"POSITION",(char*)"RESETLOCATION", // 20-21
	(char*)"HAS2VERBS",(char*)"ISQWORD",(char*)"ISQUESTION",(char*)"ISABSTRACT", // 22-25
	(char*)"POSSIBLEINFINITIVE",(char*)"POSSIBLEADJECTIVE",(char*)"POSSIBLETOLESSVERB", // 26-28
	(char*)"POSSIBLEADJECTIVEPARTICIPLE",(char*)"HOWSTART",(char*)"POSSIBLEPHRASAL",(char*)"POSSIBLEPARTICLE",(char*)"ISCOMPARATIVE",(char*)"ISEXCLAIM", // 29-34
	(char*)"ISORIGINALMEMBER",(char*)"ISSUPERLATIVE",(char*)"SINGULAR",(char*)"ISPROBABLE",(char*)"PLURAL",(char*)"DUALNOUN", // 35-40
};

// Per-sentence tagger/parser state shared with other translation units unless marked static.

unsigned char bitCounts[MAX_SENTENCE_LENGTH]; // number of tags still to resolve in this word position

int lastClause = 0;

int lastVerbal = 0;

int lastPhrase = 0;

int lastConjunction = 0;

static bool idiomed = false;

unsigned char quotationInProgress = 0;

#ifdef TREETAGGER

// TreeTagger is something you must license for pos-tagging a collection of foreign languages

// Buying a license will get you the library you need to load with this code

// http://www.cis.uni-muenchen.de/~schmid/tools/TreeTagger/

// Set true by InitTreeTagger when no parameters are given and by LoadTreetagger when a
// tagger parameter file fails to load; InitTreeTagger clears it before loading languages.
bool treetaggerfail = true;

#ifdef WIN32

#pragma comment(lib, "../BINARIES/treetagger.lib") // where windows library is

#endif

// Exchange record handed to tag_sentence(): the caller fills word/inputtag and
// number_of_words; results are read back from resulttag and lemma.
// All arrays are 0-based, whereas sentence word positions in this file are 1-based.
typedef struct {

int number_of_words; /* number of words to be tagged */

int next_word; /* needed internally */

char **word; /* array of pointers to the words */

char **inputtag; /* array of pointers to the pretagging information */

const char **resulttag;/* array of pointers to the resulting tags */

const char **lemma; /* array of pointers to the lemmas */

} TAGGER_STRUCT;

// Callback type passed to init_treetagger as its getwordfn argument.
typedef char*(*FindIt)(char* word);

// Entry points supplied by the external TreeTagger library (dll imports on Windows).
#ifdef WIN32

int __declspec(dllimport) init_treetagger(char *param_file_name, AllocatePtr allocator, FindIt getwordfn,int language);

double __declspec(dllimport) tag_sentence(int index, TAGGER_STRUCT *ts,int language);

void __declspec(dllimport) write_treetagger();

#else

int init_treetagger(char *param_file_name, AllocatePtr allocator, FindIt getwordfn,int language);

double tag_sentence(int index, TAGGER_STRUCT *ts,int language);

void write_treetagger();

#endif

// ts is passed to tag_sentence with index 0 (pos tagging of the sentence words)
TAGGER_STRUCT ts; /* tagger interface data structure */

// tschunk is passed to tag_sentence with index 1 (chunking, fed with the tags from ts)
TAGGER_STRUCT tschunk; /* tagger interface data structure */

// Returns the TreeTagger result tag for sentence word i.
// i is a 1-based word position; the tagger result array is 0-based.
char* GetTag(int i)
{
	const char* resultTag = ts.resulttag[i - 1];
	return (char*)resultTag;
}

// True when tttag equals (ignoring case) any entry of the NULL-terminated list.
static bool TagInList(const char* tttag, const char* const* list)
{
	for (; *list; ++list)
	{
		if (!stricmp(*list, tttag)) return true;
	}
	return false;
}

// Decides whether Penn-treebank style tag `tag` is compatible with the tag TreeTagger
// produced for sentence word i (1-based; the tagger result array is 0-based).
//
// TreeTagger distinguishes be (VB*), have (VH*) and do (VD*) from other verbs (VV*) and
// spells several tags differently, so a plain string compare is not sufficient.
// Once `tag` selects one of the grouped categories below (quotes, MD, VB*, HYPH) the
// answer is decided inside that category; there is no fall-through to the final
// identity comparison.
//
// Returns false when either tag is missing.
bool MatchTag(char* tag,int i)
{
	// NOTE(review): the MD list accepts VH/VHD/VDD but neither VB nor VD, although the
	// original comments labelled those slots "infinitive be/have/do". The accepted set is
	// kept exactly as it was; confirm whether VB/VD were intended.
	static const char* const modalTags[] = { "MD", "VH", "VHD", "VDD",
		"VBD", "VBG", "VHG", "VDG", "VBN", "VHN", "VDN",
		"VBP", "VHP", "VDP", "VBZ", "VHZ", "VDZ", NULL };
	static const char* const infinitiveTags[] = { "VB", "VH", "VD", "VV", NULL };	// be, have, do, normal
	static const char* const pastTags[] = { "VBD", "VHD", "VDD", "VVD", NULL };
	static const char* const gerundTags[] = { "VBG", "VHG", "VDG", "VVG", NULL };	// gerund/pres participle
	static const char* const participleTags[] = { "VBN", "VHN", "VDN", "VVN", NULL }; // past participle
	static const char* const presentTags[] = { "VBP", "VHP", "VDP", "VVP", NULL };	// non 3rd person present
	static const char* const present3rdTags[] = { "VBZ", "VHZ", "VDZ", "VVZ", NULL };	// 3rd person present

	const char* tttag = ts.resulttag[i - 1];
	if (!tag || !tttag) return false; // result tags can be NULL (TreeTagger() also guards for this)

	// tags that are merely spelled differently by TreeTagger
	if (!stricmp(tag, "NNP") && !stricmp("NP", tttag)) return true;
	if (!stricmp(tag, "NNPS") && !stricmp("NPS", tttag)) return true;
	if (!stricmp(tag, "PRP") && !stricmp("PP", tttag)) return true;
	if (!stricmp(tag, "PRP$") && !stricmp("PP$", tttag)) return true;
	if (!stricmp(tag, "-LRB-") && !stricmp("(", tttag)) return true;
	if (!stricmp(tag, "-RRB-") && !stricmp(")", tttag)) return true;

	// quotes: '/`` self-medicating/VBG ' / '' - same for both
	// OVERLY GENEROUS- REDACT LATER
	if (!stricmp(tag, "``") || !stricmp(tag, "''")) return !stricmp(tttag, "``") || !stricmp(tttag, "''");

	if (!stricmp(tag, "MD")) return TagInList(tttag, modalTags); // modal pennbank
	if (!stricmp(tag, "VB")) return TagInList(tttag, infinitiveTags);
	if (!stricmp(tag, "VBD")) return TagInList(tttag, pastTags);
	if (!stricmp(tag, "VBG")) return TagInList(tttag, gerundTags);
	if (!stricmp(tag, "VBN")) return TagInList(tttag, participleTags);
	if (!stricmp(tag, "VBP")) return TagInList(tttag, presentTags);
	if (!stricmp(tag, "VBZ")) return TagInList(tttag, present3rdTags);

	if (!stricmp(tag, "IN") && !stricmp("IN/that", tttag)) return true; // that
	if (!stricmp(tag, "HYPH")) return !stricmp(":", tttag); // joiner ; - --

	// SENT for end - of - sentence punctuation(other punctuation tags may also differ)
	return !stricmp(tag, tttag);
}

void MarkChunk()

{

if (tschunk.number_of_words == 0) return;

WORDP type = NULL;

int start = 0;

int i;

char word[MAX_WORD_SIZE];

*word = '~';

for (i = 0; i < wordCount; ++i)

{

char* tag = (char*)tschunk.resulttag[i];

/* Status can be B - starting a chunk or I - inside a chunk continuing a chunk or O - outside a chunk like punctuation, quotation, parentheses, coordinating conjunctions(and, or ).

English tags:

ADJC adjective chunks(not inside of noun chunks)

ADVC adverb chunks(not inside of noun or adjective chunks)

CONJC complex coordinating conjunctions such as "as well (as)" or "rather (than)"

INTJ interjection

LST enumeration symbol

NC noun chunk(non - recursive noun phrase)

PC prepositional chunk(usually embeds a noun chunk

PRT verb particle

VC verb complex

CD/B-NC

char* complex = strrchr(tag, '-'); // just before complex

if (!complex) continue;

strcpy(word+1, complex);

if (*(complex - 1) == 'B') // complex begin

{

if (type) // end prior chunk

{

MarkMeaningAndImplications(0, 0, MakeMeaning(type), start, i,FIXED, true);

}

type = StoreWord(word);

AddInternalFlag(type,CONCEPT);

start = i+1;

}

else if (*(complex - 1) == 'O') // complex output of chunk (like punctuation)

{

if (type) // end prior chunk

{

MarkMeaningAndImplications(0, 0, MakeMeaning(type), start, i, FIXED, true);

type = NULL;

}

else if (!strcmp(type->word,complex)) // I prior complex continued

{

}

else// shouldnt happen

{

}

if (type) MarkMeaningAndImplications(0, 0, MakeMeaning(type), start, wordCount, FIXED,true);

}

// Runs the external TreeTagger over the current sentence and folds its results into the
// per-word arrays (wordCanonical, wordTag and, for non-english languages, the pos data).
//
// Steps:
//  1. tag the sentence words (tag_sentence index 0 into ts);
//  2. when the treetagger params contain "chunk", run the chunker over the resulting tags
//     (tag_sentence index 1 into tschunk) and mark each chunk kind over its word range;
//  3. store each lemma as the canonical form and look up the "~tag" concept per word;
//  4. optionally gambit the topic named by $cs_externaltag so script can adjust results;
//  5. for languages other than english, merge the engine's own pos data with the tag data.
//
// With multidict set, the function returns early when tagging (or, after step 1, chunking)
// is not enabled for the current language in `treetagging`.
static void TreeTagger()
{
	int bit = 1 << (languageIndex * 2);
	if (multidict && !(treetagging & bit)) return; // not doing tt for this language

	int i;
	for (i = 0; i < wordCount; ++i)
	{
		ts.word[i] = wordStarts[i + 1]; // tagger arrays are 0-based, sentence words 1-based
		ts.inputtag[i] = NULL;
	}
	ts.number_of_words = wordCount;
	tag_sentence(0, &ts, languageIndex);
	if (trace & (TRACE_PREPARE | TRACE_POS | TRACE_TREETAGGER)) Log(USERLOG, "External Tagging:\r\n");

	bit = 2 << (languageIndex * 2);
	if (multidict && !(treetagging & bit)) return; // not doing chunk for this language

	bool chunk = strstr(treetaggerParams, "chunk") ? true : false;
	if (chunk) // do chunking here but marking later
	{
		for (i = 0; i < wordCount; ++i)
		{
			tschunk.word[i] = (char*)ts.resulttag[i]; // the chunker consumes the pos tags
			tschunk.inputtag[i] = NULL;
		}
		tschunk.number_of_words = wordCount;
		tag_sentence(1, &tschunk, languageIndex);

		int starter = -1; // 0-based first word of the open chunk, -1 when none is open
		char type[20];
		*type = 0;
		for (i = 0; i < wordCount; ++i)
		{
			char* result = (char*)tschunk.resulttag[i];
			char* slash = result ? strrchr(result, '/') : NULL;
			char status = slash ? *(slash + 1) : 0; // B, I or O follows the last '/'
			if (status == 'I') continue; // continue a chunk
			if (starter >= 0) // just ended a chunk
			{
				if (trace & (TRACE_PREPARE | TRACE_TREETAGGER)) Log(USERLOG,"(%s) %s..%s \r\n", type, wordStarts[starter + 1], wordStarts[i]);
				MarkMeaningAndImplications(0, 0, MakeMeaning(StoreWord(type)), starter + 1, i);
				starter = -1; // no chunk open now (0 would look like a chunk open at the first word)
			}
			if (status == 'B') // begin a chunk
			{
				char* dash = strrchr(result, '-');
				if (dash) // chunk kind follows the last '-'
				{
					starter = i;
					strncpy(type, dash + 1, sizeof(type) - 1); // bounded: type is a small fixed buffer
					type[sizeof(type) - 1] = 0;
				}
			}
		}
		if (starter >= 0) // chunk still open at the end of the sentence
		{
			if (trace & (TRACE_PREPARE | TRACE_TREETAGGER)) Log(USERLOG,"(%s) %s..%s\r\n", type, wordStarts[starter + 1], wordStarts[wordCount]);
			MarkMeaningAndImplications(0, 0, MakeMeaning(StoreWord(type)), starter + 1, wordCount);
		}
	}

	// set up arrays with TT results in case they are to be adjusted
	char newtag[MAX_WORD_SIZE];
	for (i = 1; i <= wordCount; i++)
	{
		WORDP canonical = NULL;
		char* lemma = (char*)ts.lemma[i - 1];
		if (!lemma || !strcmp(lemma, "<unknown>")) lemma = (char*)"unknown-word";
		if (strcmp(lemma, "@card@")) // leave special card canonical alone, preserving the original number
		{
			canonical = StoreWord(lemma, 0);
			wordCanonical[i] = canonical->word;
		}
		else
		{
			canonical = StoreWord(wordCanonical[i], 0); // make sure the canonical word exists
		}

		char* tag = (char*)ts.resulttag[i - 1];
		if (!tag) tag = (char*)"unknown-tag";
		*newtag = '~'; // concept from the tag
		strcpy(newtag + 1, tag);
		WORDP tagword = FindWord(newtag);
		if (!tagword)
		{
			strcpy(newtag + 1, (char*)"unknown-tag");
			tagword = FindWord(newtag);
		}
		wordTag[i] = tagword;
	}

	// TreeTagger results may need a bit of tweaking, so gambit a topic
	char* taggingTopic = GetUserVariable((char*)"$cs_externaltag");
	if (*taggingTopic)
	{
		char* oldhow = howTopic;
		howTopic = "gambit";
		int oldreuseid = currentReuseID;
		int oldreusetopic = currentReuseTopic;
		int topicid = FindTopicIDByName(taggingTopic);
		if (topicid && !(GetTopicFlags(topicid) & TOPIC_BLOCKED))
		{
			ChangeDepth(1, (char*)"$cs_externaltag");
			int pushed = PushTopic(topicid);
			if (pushed >= 0)
			{
				PerformTopic(GAMBIT, currentOutputBase);
				if (pushed) PopTopic();
			}
			ChangeDepth(-1, (char*)"$cs_externaltag");
		}
		// restore unconditionally: howTopic was overwritten above even when the topic
		// was not found or is blocked
		currentReuseID = oldreuseid;
		currentReuseTopic = oldreusetopic;
		howTopic = oldhow;
	}

	/* mix a bit of ours and theirs */
	if (stricmp(language, "english")) for (i = 1; i <= wordCount; i++)
	{
		originalUpper[i] = NULL;
		originalLower[i] = NULL;

		// Canonicals might have adjusted lemma explicitily via SetCanon() in the $cs_externaltag topic
		WORDP canonical0 = FindWord(wordCanonical[i]);

		// Might have adjusted TT's initial tag via SetTag() in the $cs_externaltag topic
		strcpy(newtag, wordTag[i]->word);
		*newtag = '_';
		WORDP X = FindWord(newtag);

		int start = 1;
		WORDP entry = NULL;
		WORDP canonical = 0;
		uint64 sysflags = 0;
		uint64 cansysflags = 0;
		WORDP revise = NULL;
		uint64 flags = GetPosData(i, wordStarts[i], revise, entry, canonical, sysflags, cansysflags, true, false, start); // flags will be potentially beyond what is stored on the word itself (like noun_gerund) but not adjective_noun
		if (revise != NULL) wordStarts[i] = revise->word;

		// Reuse the lemma word unless
		// - the word is a concept,
		// - we have found a better version of the canonical
		// - this is a number and TT thinks so too, the CS canonical will be digits
		if (*wordStarts[i] == '~') { ; }
		else if (canonical0 && canonical && canonical0->properties == 0 && canonical->properties > 0) { ; } // either pointer may be NULL here
		else if (flags & NUMBER_BITS && X && X->properties & NUMBER_BITS) { ; }
		else
		{
			canonical = canonical0;
			flags = 0;
		}
		if (!canonical) canonical = entry;

		if (IsUpperCase(canonical->internalBits & UPPERCASE_HASH))
		{
			originalUpper[i] = entry;
			canonicalUpper[i] = canonical;
		}
		else
		{
			originalLower[i] = entry;
			canonicalLower[i] = canonical;
		}
		parseFlags[i] = canonical->parseBits;
		posValues[i] = flags;
		if (originalLower[i]) lcSysFlags[i] = sysflags; // from lower case
		canSysFlags[i] = cansysflags;
		if (entry && entry->properties & PART_OF_SPEECH) ++knownWords; // known as lower or upper
		if (*wordStarts[i] == '~') posValues[i] = 0; // interjection
		wordCanonical[i] = canonical->word;
		if (X) posValues[i] |= X->properties; // english pos tag references added
	}

	if (trace & (TRACE_PREPARE | TRACE_POS))
	{
		for (i = 1; i <= wordCount; i++)
		{
			char* tag = (char*)ts.resulttag[i - 1];
			if (chunk)
			{
				char* tag1 = (char*)tschunk.resulttag[i - 1];
				Log(USERLOG, "%s(%s %s) ", wordStarts[i], tag1, ts.lemma[i - 1]);
			}
			else Log(USERLOG, "%s(%s %s) ", wordStarts[i], tag, ts.lemma[i - 1]);
		}
		Log(USERLOG, "\r\n\r\n");
	}
}

static void BlendWithTreetagger(bool &changed)

{

if (!ts.number_of_words) return;

static int modified = 0;

// ts.number_of_words = 0; // do only once

char word[MAX_WORD_SIZE];

*word = '_'; // marker to keep any collision away from foreign pos

int i;

int blocked = 0;

for (i = 1; i <= wordCount; ++i)

{

strcpy(word + 1, ts.resulttag[i - 1]);

WORDP D = FindWord(word, 0, PRIMARY_CASE_ALLOWED);

if (!D) continue; // didnt find it?

uint64 bits = D->properties & TAG_TEST; // dont want general headers like NOUN or VERB

uint64 possiblebits = posValues[i] & TAG_TEST; // bits we are trying to resolve

if (bits & VERB_INFINITIVE && possiblebits & VERB_PRESENT) possiblebits |= VERB_INFINITIVE; // when we launch, we want to be right

if (bits & PREPOSITION && possiblebits & PARTICLE) bits |= PARTICLE;

if (bits & PARTICLE && possiblebits & PREPOSITION) bits |= PREPOSITION;

if (bits & NOUN_SINGULAR && possiblebits & (PRONOUN_SUBJECT | PRONOUN_OBJECT)) bits |= PRONOUN_SUBJECT | PRONOUN_OBJECT;

if (!stricmp(D->word+1,"CD") && possiblebits & (PRONOUN_SUBJECT | PRONOUN_OBJECT)) bits |= PRONOUN_SUBJECT | PRONOUN_OBJECT; // "one" as pronoun

if (bits & ADJECTIVE_NORMAL && possiblebits & DETERMINER)

{

bits ^= ADJECTIVE_NORMAL;

bits |= DETERMINER; // I could make *other arrangements

}

uint64 allowable = bits & possiblebits; // bits for this tagtype

if (allowable && allowable != possiblebits) // we can update choices

{

posValues[i] ^= possiblebits;

posValues[i] |= allowable;

bitCounts[i] = BitCount(posValues[i]);

changed = true;

if (trace & TRACE_TREETAGGER)

{

++modified;

Log(USERLOG,"Treetagger adjusted %d: \"%s\" given %s, removing: ", i, wordStarts[i], ts.resulttag[i - 1]);

uint64 lost = possiblebits ^ allowable; // these bits disappeared

uint64 bit = START_BIT;

while (bit)

{

if (lost & bit) Log(USERLOG,"%s ", FindNameByValue(bit));

bit >>= 1;

}

Log(USERLOG," CS Remain: ");

bit = START_BIT;

while (bit)

{

if (allowable & bit)

Log(USERLOG,"%s ", FindNameByValue(bit));

bit >>= 1;

}

Log(USERLOG,"\r\n");

}

break; // as soon as changed. return

}

else if (!allowable && trace & (TRACE_PREPARE | TRACE_TREETAGGER))

{

Log(USERLOG,"Treetagger could not adjust %d: \"%s\" given %s, leaves CS as: ", i, wordStarts[i], ts.resulttag[i - 1]);

uint64 bit = START_BIT;

++blocked;

while (bit)

{

if (possiblebits & bit) Log(USERLOG,"%s ", FindNameByValue(bit));

bit >>= 1;

}

Log(USERLOG,"instead of changing to:");

uint64 lost = bits & (-1 ^ possiblebits); // these bits disappeared from treetagger

bit = START_BIT;

while (bit)

{

if (lost & bit) Log(USERLOG,"%s ", FindNameByValue(bit));

bit >>= 1;

}

Log(USERLOG,"\r\n");

}

else if (trace & (TRACE_PREPARE | TRACE_TREETAGGER))

{

Log(USERLOG,"Treetagger matches %d: \"%s\" TT: %s CS: ", i, wordStarts[i], ts.resulttag[i - 1]);

uint64 bit = START_BIT;

while (bit)

{

if (allowable & bit)

Log(USERLOG,"%s ", FindNameByValue(bit));

bit >>= 1;

}

Log(USERLOG,"\r\n");

}

if (trace & (TRACE_PREPARE | TRACE_TREETAGGER)) Log(USERLOG,"\r\n");

ttLastChanged = i; // or it ran out

if (i > wordCount)

{

if (trace & (TRACE_PREPARE | TRACE_TREETAGGER)) Log(USERLOG,"TT Adjustments: %d Refusals: %d Words %d\r\n\r\n", modified, blocked,wordCount);

modified = 0;

}

// Loads the TreeTagger data for one language (passed to WalkLanguages by InitTreeTagger).
//
// Skips the language unless treetaggerParams starts with '1' or occurs inside the
// upper-cased language name. Reads DICT/<language>_tags.txt, then initializes the tagger
// from treetagger/<language>.par; on success installs TreeTagger as the external
// pos-tagger. For german with "chunk" in the params it also loads
// treetagger/<language>_chunker.par, and falls back to params "1" when that fails.
//
// Sets treetaggerfail when the tagger parameter file cannot be loaded.
static void LoadTreetagger(char* language)
{
	char name[MAX_WORD_SIZE];
	char lang[MAX_WORD_SIZE];
	MakeUpperCopy(lang, language);
	if (*treetaggerParams != '1' && !strstr(lang, treetaggerParams)) return; // dont load this

	MakeLowerCopy(lang, language);
	sprintf(name, "DICT/%s_tags.txt", lang);
	if (!ReadForeignPosTags(name)) return; //failed
	externalTagger |= 2 << languageIndex; // using external tagging for this language

	char langfile[MAX_WORD_SIZE];
	sprintf(langfile, "treetagger/%s.par", language);
	MakeLowerCase(langfile);
	char* heapstart = heapFree; // remember heap position to report how much the load consumed
	//write_treetagger();
	bool result = init_treetagger(langfile, AllocateConstHeap, GetWord, languageIndex); // NULL, NULL or AllocateHeap, GetWord); /* Initialization of the tagger with the language parameter file */
	if (!result)
	{
		(*printer)(" Unable to load %s\r\n",langfile);
		treetaggerfail = true;
		return;
	}
	externalPostagger = TreeTagger;
	if (multidict) treetagging |= 1 << (languageIndex * 2); // tagging enabled for this language

	if (strstr(treetaggerParams, "chunk") && result && !stricmp(language, "german"))
	{
		sprintf(langfile, "treetagger/%s_chunker.par", language);
		MakeLowerCase(langfile);
		result = init_treetagger(langfile, AllocateConstHeap, GetWord, languageIndex); // NULL, NULL); /* Initialization of the tagger with the chunker parameter file */
		if (!result) strcpy(treetaggerParams, "1"); // give up chunking
		else if (multidict)
		{
			treetagging |= 2 << (languageIndex * 2); // chunking enabled for this language
			printf("Loaded chunking %s\r\n", language);
		}
	}

	unsigned int diff = heapstart - heapFree;
	printf("Loaded treetagger %s (%uMB)", language,diff/1000000); // %u: the value is unsigned
	if (result) (*printer)("\r\n");
	else (*printer)(" Unable to load chunk file\r\n");
}

// Entry point for enabling TreeTagger support ("tags=xxxx" just triggers this).
//   params - tagger parameter string; an empty string leaves treetaggerfail set
//            and does nothing else.
// Otherwise every language is offered to LoadTreetagger (which may set
// treetaggerfail again), and the per-sentence pointer tables of both the tagger
// (ts) and the chunker (tschunk) are allocated with MAX_SENTENCE_LENGTH slots.
void InitTreeTagger(char* params) // tags=xxxx - just triggers this thing
{
	const bool noParams = !*params;
	treetaggerfail = noParams;
	if (noParams) return;

	// load each foreign postag and its correspondence to english postags
	WalkLanguages(LoadTreetagger); // can set treetaggerfail

	// One pointer slot per possible word of an input sentence.
	const size_t tableBytes = sizeof(char*) * MAX_SENTENCE_LENGTH;

	// tagger tables
	ts.word = (char**)AllocateHeap(NULL, tableBytes);
	ts.inputtag = (char**)AllocateHeap(NULL, tableBytes, true);
	ts.resulttag = (const char**)AllocateHeap(NULL, tableBytes, true);
	ts.lemma = (const char**)AllocateHeap(NULL, tableBytes, true);

	// chunker tables
	tschunk.word = (char**)AllocateHeap(NULL, tableBytes);
	tschunk.inputtag = (char**)AllocateHeap(NULL, tableBytes, true);
	tschunk.resulttag = (const char**)AllocateHeap(NULL, tableBytes, true);
	tschunk.lemma = (const char**)AllocateHeap(NULL, tableBytes, true);
}

#endif

// Writes two trace lines to the user log for words start..end (inclusive).
// Line 1 ("Xref: "): each word with its non-zero cross reference (>), indirect
// object (i), object (o) and complement (c) references, printed in decimal.
// Line 2: each word with its non-zero phrase (p), verbal (v) and clause (c)
// values, printed in hex.
static void DumpCrossReference(int start, int end)
{
	int at;

	Log(USERLOG,"Xref: ");
	at = start;
	while (at <= end)
	{
		Log(USERLOG,"%d:%s",at,wordStarts[at]);
		if (crossReference[at]) Log(USERLOG," >%d",crossReference[at]);
		if (indirectObjectRef[at]) Log(USERLOG," i%d",indirectObjectRef[at]);
		if (objectRef[at]) Log(USERLOG," o%d",objectRef[at]);
		if (complementRef[at]) Log(USERLOG," c%d",complementRef[at]);
		Log(USERLOG," ");
		++at;
	}
	Log(USERLOG,"\r\n");

	at = start;
	while (at <= end)
	{
		Log(USERLOG,"%d:%s",at,wordStarts[at]);
		if (phrases[at]) Log(USERLOG," p%x",phrases[at]);
		if (verbals[at]) Log(USERLOG," v%x",verbals[at]);
		if (clauses[at]) Log(USERLOG," c%x",clauses[at]);
		Log(USERLOG," ");
		++at;
	}
	Log(USERLOG,"\r\n");
}

// Turns off expensive analysis for over-long input.
//   input - the text about to be processed.
// When parseLimit is non-zero and input is longer than parseLimit characters,
// parseLimited is set and the DO_POSTAG, DO_PARSE and DO_SPELLCHECK bits are
// cleared from tokenControl. Otherwise nothing changes.
void CheckParseLimit(char* input)
{
	if (parseLimit && strlen(input) > parseLimit)
	{
		parseLimited = true;
		// -1 ^ mask is the complement of mask: keep every bit except these three
		tokenControl &= -1 ^ (DO_POSTAG | DO_PARSE | DO_SPELLCHECK);
	}
}

static void SetCanonicalValue(int start,int end)

{

int upper = 0;

int lower = 0;

for (int i = start; i <= end; ++i)

{

if (ignoreWord[i]) continue;

if (IsUpperCase(*wordStarts[i])) ++upper;

else ++lower;

}

bool caseSignificant = (lower > 3 && lower > upper);

bool csEnglish = !stricmp(language, "english");

// now set canonical lowercase forms

for (int i = start; i <= end; ++i)

{

if (ignoreWord[i]) continue;

char* original = wordStarts[i];

WORDP can = canonicalLower[i];

if (originalLower[i]) original = originalLower[i]->word;

uint64 pos = posValues[i] & (TAG_TEST|PART_OF_SPEECH);

if (!pos && !(*original == '~')) posValues[i] = pos = NOUN; // default it back to something

WORDP D = FindWord(original);

WORDP canon1 = (D) ? GetCanonical(D) : NULL;

char* canon = (canon1) ? canon1->word : NULL;

if (csEnglish && posValues[i] & (DETERMINER| IDIOM) && original[1] == 0) // treat "a" as not a letter A

{

canon = NULL;

canonicalLower[i] = originalLower[i];

continue;

}

else if (!original[1] && !IsAlphaUTF8(*original)) // symbols, punctuation, etc

{

canon = NULL;

canonicalLower[i] = originalLower[i];

continue;

}

else if (csEnglish && allOriginalWordBits[i] & CONJUNCTION )

{

if (!stricmp(wordStarts[i], "times")) // a conjunction looking like plural that in singular is a normal word

{

canonicalLower[i] = FindWord("time", 0);

}

else canonicalLower[i] = originalLower[i];

continue;

}

// a word like "won" has noun, verb, adjective meanings. We prefer a canonical that's different from the original

if (csEnglish && canon && IsUpperCase(*canon)) canonicalUpper[i] = FindWord(canon);

else if (csEnglish && canon) canonicalLower[i] = FindWord(canon);

else if (pos & NUMBER_BITS); // must occur before verbs and nouns, since "second" is a verb and a noun

else if (canonicalLower[i] && canonicalLower[i]->properties & (NOUN_NUMBER|ADJECTIVE_NUMBER)); // dont change canonical numbers like December second

else if (csEnglish && allOriginalWordBits[i] & NOUN_GERUND) // because singing is a dict word, we might prefer noun over gerund. We shouldned

{

canonicalLower[i] = FindWord(GetInfinitive(original,false));

}

else if (csEnglish && pos & (VERB_BITS | NOUN_GERUND | NOUN_INFINITIVE|ADJECTIVE_PARTICIPLE) )

{

canonicalLower[i] = FindWord(GetInfinitive(original,false));

}

else if (csEnglish && pos & ADJECTIVE_NORMAL && !(D && D->properties & (MORE_FORM|MOST_FORM)))

{

canonicalLower[i] = originalLower[i]; // "his *fixed view should be adjective and not participle given it is an adjective- arbitrary

if (allOriginalWordBits[i] & ADJECTIVE_PARTICIPLE)

{

char* verb = GetInfinitive(wordStarts[i],true);

if (verb) canonicalLower[i] = FindWord(verb);

}

else if (csEnglish && ((pos & (NOUN_BITS - NOUN_GERUND - NOUN_ADJECTIVE)) || (canonicalLower[i] && !stricmp(canonicalLower[i]->word,original))))

{

if (pos & (NOUN_PROPER_SINGULAR|NOUN_PROPER_PLURAL) && canonicalUpper[i] && canonicalUpper[i]->properties & NOUN) // can it be upper case interpretation?

{

// if ONLY upper case interpretation

if (!(pos & (VERB_BITS|NOUN_SINGULAR|NOUN_PLURAL|ADJECTIVE_NOUN)) && !canonicalLower[i])

{

char* word = (originalUpper[i]) ? originalUpper[i]->word : canonicalUpper[i]->word;

word = AllocateHeap(word); // dont share because we might edit the word in place (eg ONLY_LOWER)

original = wordStarts[i] = word; // make it upper case

originalLower[i] = canonicalLower[i] = 0; // blow away any lower meaning

}

if (canonicalLower[i] && canonicalLower[i]->properties & (DETERMINER|NUMBER_BITS));

else if (IsAlphaUTF8(*original) && canonicalLower[i] == DunknownWord); // keep unknown-ness

else if (csEnglish && pos & NOUN_BITS && !canonicalUpper[i])

{

char* noun = GetSingularNoun(original,false,true);

if (noun) canonicalLower[i] = FindWord(noun);

}

else if (csEnglish && D && D->internalBits & UPPERCASE_HASH && FindWord(original, 0,LOWERCASE_LOOKUP))

{

canonicalLower[i] = FindWord(original,0, LOWERCASE_LOOKUP);

}

else if (csEnglish && ((pos & (ADJECTIVE_BITS - ADJECTIVE_PARTICIPLE - ADJECTIVE_NOUN)) || (canonicalLower[i] && !stricmp(canonicalLower[i]->word,original))))

{

if (canonicalLower[i] && canonicalLower[i]->properties & NUMBER_BITS);

else

{

char* adj = GetAdjectiveBase(original,false);

if (adj) canonicalLower[i] = FindWord(adj);

}

// for adjectives that are verbs, like married, go canonical to the verb if adjective is unchanged

if (csEnglish && canonicalLower[i] && !strcmp(canonicalLower[i]->word,original))

{

char* infinitive = GetInfinitive(original,false);

if (infinitive) canonicalLower[i] = FindWord(infinitive);

}

else if (csEnglish && pos & ADJECTIVE_NOUN)

{

if (canonicalLower[i] && canonicalLower[i]->properties & NUMBER_BITS);

else if (IsUpperCase(*wordStarts[i]) && caseSignificant) {;} // upper case is intentional

else

{

char* adj = GetAdjectiveBase(original,false);

// if (adj) canonicalLower[i] = FindWord(adj); // noun is not an adjective

}

else if (csEnglish && ((pos & ADVERB) || (canonicalLower[i] && !stricmp(canonicalLower[i]->word,original))))

{

if (canonicalLower[i] && canonicalLower[i]->properties & NUMBER_BITS);

else canonicalLower[i] = FindWord(GetAdverbBase(original,false));

// for adverbs that are adjectives, like faster, go canonical to the adjective if adverb is unchanged

if (canonicalLower[i] && !strcmp(canonicalLower[i]->word,original))

{

char* adjective = GetAdjectiveBase(original,false);

if (adjective) canonicalLower[i] = FindWord(adjective);

}

else if (*original == '~') canonicalLower[i] = FindWord(original);

else if (!IsAlphaUTF8(*original)) canonicalLower[i] = FindWord(original);

if (csEnglish && pos & PRONOUN_BITS && !stricmp(original,(char*)"one")) // make it a number

{

canonicalLower[i] = StoreWord((char*)"1",NOUN|NOUN_NUMBER, NOUN_NODETERMINER);

}

// handle composite verb canonical for single hypen case

char* hyphen = strchr(original,'-');

if (csEnglish && hyphen && pos & (VERB_BITS|NOUN_GERUND|ADJECTIVE_PARTICIPLE|NOUN_INFINITIVE)) // find the verb root.

{

char word[MAX_WORD_SIZE];

strcpy(word,original);

char* h = word + (hyphen-original);

char* verb = GetInfinitive(h+1,true);

if (verb) // 2nd half

{

strcpy(h+1,verb);

canonicalLower[i] = StoreWord(word,VERB|VERB_INFINITIVE);

}

else

{

*h = 0;

verb = GetInfinitive(word,true);

if (verb)

{

strcpy(word,verb);

strcat(word,hyphen);

canonicalLower[i] = StoreWord(word,VERB|VERB_INFINITIVE);

}

if (can == DunknownWord) // restore unknown word status

{

if (IsUpperCase(*original)) canonicalUpper[i] = can;

else canonicalLower[i] = can;

}

if (canonicalLower[i] && IsDigit(*canonicalLower[i]->word)) wordCanonical[i] = canonicalLower[i]->word; // leave numbers alone

else if (csEnglish && canonicalLower[i] && originalLower[i])

{

if (!GetCanonical(originalLower[i]) && posValues[i] & NOUN_SINGULAR && !(allOriginalWordBits[i] & NOUN_GERUND) && canonicalLower[i] != DunknownWord) // saw does not become see, it stays original - but singing should still be sing and "what do you think of dafatgat" should remain

{

canonicalLower[i] = originalLower[i];

wordCanonical[i] = originalLower[i]->word;

}

else wordCanonical[i] = canonicalLower[i]->word;

}

else if (canonicalUpper[i]) wordCanonical[i] = canonicalUpper[i]->word;

else wordCanonical[i] = wordStarts[i];

}

if (csEnglish) SetSentenceTense(start,end);

}

// Appends to buff, for every set bit of bits taken from lowest to highest, a
// space followed by the name FindNameByValue reports for that single bit.
//   bits - bit set to describe.
//   buff - destination; strcat is used, so it must already hold a terminated
//          string and be large enough for everything appended.
// Returns buff.
static char* PosBits(uint64 bits, char* buff)
{
	for (uint64 remaining = bits; remaining != 0; )
	{
		uint64 withoutLowest = remaining & (remaining - 1); // clears the lowest set bit
		uint64 lowestBit = remaining ^ withoutLowest;       // exactly that one bit
		remaining = withoutLowest;

		strcat(buff,(char*)" ");
		strcat(buff,FindNameByValue(lowestBit));
	}
	return buff;
}

// Returns the first index after i whose posValues entry is neither exactly
// IDIOM nor exactly QUOTE. No upper bound is tested, so the scan relies on
// such an entry existing beyond i.
static int NextPos(int i)
{
	do
	{
		++i;
	}
	while (posValues[i] == IDIOM || posValues[i] == QUOTE);
	return i;
}

// Returns the first index after i whose posValues entry is neither exactly
// IDIOM nor exactly QUOTE. No upper bound is tested.
// NOTE(review): this performs the same scan as NextPos; the name suggests it
// may have been meant to advance two positions - confirm against callers.
static int Next2Pos(int i)
{
	for (;;)
	{
		++i;
		if (posValues[i] == IDIOM) continue;
		if (posValues[i] == QUOTE) continue;
		return i;
	}
}

// Appends to buff, for every set bit of bits taken from lowest to highest, a
// space followed by the name FindSystemNameByValue reports for that single bit.
//   bits - bit set to describe.
//   buff - destination; strcat is used, so it must already hold a terminated
//          string and be large enough for everything appended.
// Returns buff.
static char* PropertyBits(uint64 bits, char* buff)
{
	for (uint64 remaining = bits; remaining != 0; )
	{
		uint64 withoutLowest = remaining & (remaining - 1); // clears the lowest set bit
		uint64 lowestBit = remaining ^ withoutLowest;       // exactly that one bit
		remaining = withoutLowest;

		strcat(buff,(char*)" ");
		strcat(buff,FindSystemNameByValue(lowestBit));
	}
	return buff;
}

static bool LimitValues(int i, uint64 bits,char* msg,bool& changed)

{

uint64 old = posValues[i];

posValues[i] &= bits;

char buff[MAX_WORD_SIZE];

if (old != posValues[i])

{

int oldcount = bitCounts[i];

changed = true;

if (posValues[i] == 0) // shrank to nothing

{

if (bits & ADJECTIVE_NOUN) // special insertion of adjective_noun

{

posValues[i] = ADJECTIVE_NOUN;

allOriginalWordBits[i] |= ADJECTIVE_NOUN;

}

else if (bits & NOUN_ADJECTIVE) // special insertion of this

{

posValues[i] = NOUN_ADJECTIVE;

allOriginalWordBits[i] |= NOUN_ADJECTIVE;

}

else if (!bits) posValues[i] = allOriginalWordBits[i] & TAG_TEST;

else posValues[i] = bits & allOriginalWordBits[i] & TAG_TEST; // back up to what it COULD have been originally that we will now accept

if (trace & TRACE_POS)

View remainder of file in raw view

Sunbelt Computer Software

PL/B Language Development and Support

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Sunbelt Computer Software

PL/B Language Development and Support

FilesExpand file tree

englishTagger.cpp

Latest commit

History

englishTagger.cpp

File metadata and controls