-
Notifications
You must be signed in to change notification settings - Fork 7
/
html.c
143 lines (124 loc) · 4.49 KB
/
html.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
/* html.c Treaty of Babel module for HTML files
* Written 2020 By Andrew Plotkin
*
* This file depends on treaty_builder.h
*
* This file is public domain, but note that any changes to this file
* may render it noncompliant with the Treaty of Babel
*/
#define FORMAT html
#define HOME_PAGE "https://babel.ifarchive.org/"
#define FORMAT_EXT ".html"
#define NO_METADATA
#define NO_COVER
#include "treaty_builder.h"
#include "ifiction.h"
#include <ctype.h>
#include <stdio.h>
/* Searches case-insensitively for the string str in the story file.
   The string must be null-terminated. The story file can have nulls
   anywhere, of course.

   story_file: start of the buffer to search.
   extent:     number of bytes in the buffer.
   str:        null-terminated text to look for.

   Every offset at which str fits entirely inside the buffer is tested,
   including the final one (offset extent-len), so a match that ends on
   the last byte of the file is found.

   Returns the found position, or -1 for not found. An empty str, or a
   str longer than the buffer, is reported as not found.
*/
static int32 find_text_in_file(void *story_file, int32 extent, char *str)
{
    const char *data = (const char *)story_file;
    int32 len = (int32)strlen(str);
    int32 last;
    int32 ix;

    if (len == 0 || len > extent) {
        return -1;
    }

    /* Last offset at which a full-length comparison stays in bounds. */
    last = extent - len;
    for (ix = 0; ix <= last; ix++) {
        /* str has no embedded nulls, so a null byte in the file is just
           a mismatch; the comparison never reads past ix+len-1. */
        if (strncasecmp(data + ix, str, (size_t)len) == 0) {
            return ix;
        }
    }
    return -1;
}
/* Searches case-insensitively for str followed, later on, by str2, where
   the text between the two does not contain "<" or ">". This is a crude
   way to locate an HTML tag with an attribute.
   Kids, don't parse HTML this way.

   story_file: start of the buffer to search.
   extent:     number of bytes in the buffer.
   str, str2:  null-terminated strings; str2 may begin immediately after
               str or after any run of bytes other than '<' and '>'.

   Every in-bounds position is tested, so str2 is found even when it ends
   on the last byte of the buffer.

   Returns the position of str, or -1 for not found. If either string is
   empty, or the two cannot both fit in the buffer, the result is -1.
*/
static int32 find_text_pair_in_file(char *story_file, int32 extent, char *str, char *str2)
{
    int32 len = (int32)strlen(str);
    int32 len2 = (int32)strlen(str2);
    int32 last_ix, last_jx;
    int32 ix, jx;

    if (len == 0 || len2 == 0) {
        return -1;
    }
    if (len > extent || len2 > extent - len) {
        return -1;
    }

    /* Last offset where str2 still fits, and last offset where str can
       start while leaving room for str2 after it. */
    last_jx = extent - len2;
    last_ix = last_jx - len;

    for (ix = 0; ix <= last_ix; ix++) {
        if (strncasecmp(story_file + ix, str, (size_t)len) != 0) {
            continue;
        }
        for (jx = ix + len; jx <= last_jx; jx++) {
            char ch = story_file[jx];
            /* A tag delimiter ends this candidate; resume looking for
               the next occurrence of str. */
            if (ch == '<' || ch == '>') {
                break;
            }
            if (strncasecmp(story_file + jx, str2, (size_t)len2) == 0) {
                return ix;
            }
        }
    }
    return -1;
}
/* Copies the value of one attribute of the tag beginning at offset pos
   into output as a null-terminated string.

   The tag is taken to run from story_file+pos up to the first '>' found
   in the remaining extent-pos bytes. attribute_prefix (for example
   content=" with its opening quote) is searched for case-insensitively
   inside that range via find_text_in_file; the value is everything after
   the prefix up to the next '"' that appears before the closing '>'.

   Returns VALID_STORY_FILE_RV when a value was copied, or
   INVALID_STORY_FILE_RV when the closing '>', the prefix, or the closing
   quote is missing. ASSERT_OUTPUT_SIZE is defined elsewhere; presumably
   it returns early with an error when output_extent is too small for the
   value plus its terminator -- confirm against treaty.h.
*/
static int32 find_attribute_value(char *story_file, int32 extent, char *output, int32 output_extent, int32 pos, char* attribute_prefix) {
    char *tag_begin = story_file + pos;
    char *tag_end = memchr(tag_begin, '>', extent-pos);
    char *value_begin;
    char *value_end;
    int32 prefix_at;
    int32 value_len;

    if (!tag_end) {
        /* The tag is never closed. */
        return INVALID_STORY_FILE_RV;
    }

    prefix_at = find_text_in_file(tag_begin, tag_end-tag_begin, attribute_prefix);
    if (prefix_at == -1) {
        /* The tag does not carry this attribute. */
        return INVALID_STORY_FILE_RV;
    }

    /* The value starts right after the prefix and runs to the next quote. */
    value_begin = tag_begin + prefix_at + strlen(attribute_prefix);
    value_end = memchr(value_begin, '"', tag_end-value_begin);
    if (!value_end) {
        /* No closing quote before the end of the tag. */
        return INVALID_STORY_FILE_RV;
    }

    value_len = value_end - value_begin;
    ASSERT_OUTPUT_SIZE(value_len+1);
    memcpy(output, value_begin, value_len);
    output[value_len] = '\0';
    return VALID_STORY_FILE_RV;
}
/* Determines the IFID of an HTML story file, trying these sources in
   order and stopping at the first one that applies:

     1. A <meta ... property="ifiction:ifid" ...> tag: the result is
        whatever find_attribute_value reports for its content="..."
        attribute.
     2. find_uuid_ifid_marker (defined elsewhere): its result is returned
        if it is VALID_STORY_FILE_RV or INVALID_USAGE_RV.
     3. A <tw-storydata tag (Twine 2): the result is whatever
        find_attribute_value reports for its ifid="..." attribute.
     4. Otherwise "HTML-" is written to output and INCOMPLETE_REPLY_RV is
        returned; the remainder of the IFID is expected to be derived
        from an MD5 by the caller.

   ASSERT_OUTPUT_SIZE is defined elsewhere; presumably it returns early
   when output_extent is below the requested size -- confirm against
   treaty.h.
*/
static int32 get_story_file_IFID(void *story_file, int32 extent, char *output, int32 output_extent)
{
    int32 tag_at;
    int32 uuid_rv;

    /* Explicit ifiction meta tag */
    tag_at = find_text_pair_in_file(story_file, extent, "<meta", "property=\"ifiction:ifid\"");
    if (tag_at != -1) {
        return find_attribute_value(story_file, extent, output, output_extent, tag_at, "content=\"");
    }

    /* UUID style */
    uuid_rv = find_uuid_ifid_marker(story_file, extent, output, output_extent);
    if (uuid_rv == VALID_STORY_FILE_RV) {
        return uuid_rv;
    }
    if (uuid_rv == INVALID_USAGE_RV) {
        return uuid_rv;
    }

    /* Twine 2 */
    tag_at = find_text_in_file(story_file, extent, "<tw-storydata");
    if (tag_at != -1) {
        return find_attribute_value(story_file, extent, output, output_extent, tag_at, "ifid=\"");
    }

    /* Generate IFID from MD5 */
    ASSERT_OUTPUT_SIZE(8);
    strcpy(output,"HTML-");
    return INCOMPLETE_REPLY_RV;
}
/* Decides whether the story file looks like HTML.

   Returns VALID_STORY_FILE_RV if any of the following is found
   (case-insensitively), otherwise INVALID_STORY_FILE_RV:
     - "<tw-storydata": Twine 2 or later. See
       https://github.com/iftechfoundation/twine-specs/blob/master/twine-2-htmloutput-spec.md#story-data
     - "modifier=\"twee\"": Twine 1, almost certainly.
     - "<html" or "<!doctype html": generic markers, treated as
       pretty definitive.
     - a "<meta" tag carrying property="ifiction:ifid".
*/
static int32 claim_story_file(void *story_file, int32 extent)
{
    /* Single-string signatures, checked in this order. */
    static char *signatures[] = {
        "<tw-storydata",
        "modifier=\"twee\"",
        "<html",
        "<!doctype html"
    };
    int count = (int)(sizeof(signatures) / sizeof(signatures[0]));
    int which;

    for (which = 0; which < count; which++) {
        if (find_text_in_file(story_file, extent, signatures[which]) != -1) {
            return VALID_STORY_FILE_RV;
        }
    }

    /* Last resort: an ifiction IFID meta tag. */
    if (find_text_pair_in_file(story_file, extent, "<meta", "property=\"ifiction:ifid\"") == -1) {
        return INVALID_STORY_FILE_RV;
    }
    return VALID_STORY_FILE_RV;
}