diff --git a/README.md b/README.md index b7bc5538..39de75a3 100644 --- a/README.md +++ b/README.md @@ -69,6 +69,7 @@ cinatra支持通过指令集优化其内部逻辑,其通过宏来控制是否 ```shell cmake -DENABLE_SIMD=SSE42 .. # 启用sse4.2指令集 cmake -DENABLE_SIMD=AVX2 .. # 启用avx2指令集 +cmake -DENABLE_SIMD=AARCH64 .. # arm环境下,启用neon指令集 ``` # 快速示例 diff --git a/cmake/develop.cmake b/cmake/develop.cmake index 8b450cae..864fb2b5 100644 --- a/cmake/develop.cmake +++ b/cmake/develop.cmake @@ -74,7 +74,7 @@ endif() if(ENABLE_SIMD STREQUAL "SSE42" OR ENABLE_SIMD STREQUAL "AVX2" OR ENABLE_SIMD STREQUAL "AARCH64") if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") message(STATUS "Build with simd in aarch64") - add_definitions(-DCINATRA_AARCH64) + add_definitions(-DCINATRA_ARM_OPT) elseif (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") message(STATUS "Build with simd in X86_64") if (ENABLE_SIMD STREQUAL "SSE42") diff --git a/example/CMakeLists.txt b/example/CMakeLists.txt index 474f29b4..75a60689 100644 --- a/example/CMakeLists.txt +++ b/example/CMakeLists.txt @@ -22,7 +22,9 @@ endif() if (ENABLE_SIMD STREQUAL "AARCH64") if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") - #TODO + add_library(neon INTERFACE IMPORTED) + target_compile_options(neon INTERFACE -march=armv8-a+fp+simd) + target_link_libraries(${project_name} neon) endif () elseif (ENABLE_SIMD STREQUAL "SSE42") if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") diff --git a/include/cinatra/picohttpparser.h b/include/cinatra/picohttpparser.h index 081160ac..b7a8a7c3 100644 --- a/include/cinatra/picohttpparser.h +++ b/include/cinatra/picohttpparser.h @@ -32,8 +32,27 @@ #include + +#ifdef CINATRA_SSE +#ifdef _MSC_VER +#include +#else +#include +#endif +#endif + +#ifdef CINATRA_AVX2 +#include +#endif + +#ifdef CINATRA_ARM_OPT +#include +#endif + #ifdef _MSC_VER #define ssize_t intptr_t +#else +#include #endif namespace cinatra { @@ -120,6 +139,32 @@ struct phr_chunked_decoder { CHECK_EOF(); \ EXPECT_CHAR_NO_CHECK(ch); +#ifdef CINATRA_ARM_OPT +#define ADVANCE_TOKEN(tok, toklen) \ + do { \ + const char *tok_start = buf; \ + int found2; \ + buf = findchar_nonprintable_fast(buf, buf_end, &found2); \ + if (!found2) { \ + CHECK_EOF(); \ + } \ + while (1) { \ + if (*buf == ' ') { \ + break; \ + } \ + else if (unlikely(!IS_PRINTABLE_ASCII(*buf))) { \ + if ((unsigned char)*buf < '\040' || *buf == '\177') { \ + *ret = -1; \ + return NULL; \ + } \ + } \ + ++buf; \ + CHECK_EOF(); \ + } \ + tok = tok_start; \ + toklen = buf - tok_start; \ + } while (0) +#else #define ADVANCE_TOKEN(tok, toklen) \ do { \ const char *tok_start = buf; \ @@ -145,6 +190,7 @@ struct phr_chunked_decoder { tok = tok_start; \ toklen = buf - tok_start; \ } while (0) +#endif static const char *token_char_map = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0" @@ -160,18 +206,135 @@ static const char *findchar_fast(const char *buf, const char *buf_end, const char *ranges, int ranges_size, int *found) { *found = 0; +#ifdef CINATRA_SSE + if (likely(buf_end - buf >= 16)) { + __m128i ranges16 = _mm_loadu_si128((const __m128i *)ranges); + + size_t left = (buf_end - buf) & ~15; + do { + __m128i b16 = _mm_loadu_si128((const __m128i *)buf); + int r = _mm_cmpestri(ranges16, ranges_size, b16, 16, + _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES | + _SIDD_UBYTE_OPS); + if (unlikely(r != 16)) { + buf += r; + *found = 1; + break; + } + buf += 16; + left -= 16; + } while (likely(left != 0)); + } +#else /* suppress unused parameter warning */ (void)buf_end; (void)ranges; (void)ranges_size; +#endif return buf; } +static const char *findchar_nonprintable_fast(const char *buf, const char *buf_end, int *found) +{ +#ifdef CINATRA_ARM_OPT + *found = 0; + + const size_t block_size = sizeof(uint8x16_t) - 1; + const char *const end = (size_t)(buf_end - buf) >= block_size ? buf_end - block_size : buf; + + for (; buf < end; buf += sizeof(uint8x16_t)) { + uint8x16_t v = vld1q_u8((const uint8_t *)buf); + + v = vorrq_u8(vcltq_u8(v, vmovq_n_u8('\041')), vceqq_u8(v, vmovq_n_u8('\177'))); + + /* Pack the comparison result into 64 bits. */ + const uint8x8_t rv = vshrn_n_u16(vreinterpretq_u16_u8(v), 4); + uint64_t offset = vget_lane_u64(vreinterpret_u64_u8(rv), 0); + + if (offset) { + *found = 1; + __asm__("rbit %x0, %x0" : "+r"(offset)); + static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "Need the number of leading 0-bits in uint64_t."); + /* offset uses 4 bits per byte of input. */ + buf += __builtin_clzll(offset) / 4; + break; + } + } + + return buf; +#else + static const char ALIGNED(16) ranges2[16] = "\000\040\177\177"; + + return findchar_fast(buf, buf_end, ranges2, 4, found); +#endif +} + static const char *get_token_to_eol(const char *buf, const char *buf_end, const char **token, size_t *token_len, int *ret) { const char *token_start = buf; +#ifdef CINATRA_SSE + static const char ranges1[] = "\0\010" + /* allow HT */ + "\012\037" + /* allow SP and up to but not including DEL */ + "\177\177" + /* allow chars w. MSB set */ + ; + int found; + buf = findchar_fast(buf, buf_end, ranges1, sizeof(ranges1) - 1, &found); + if (found) + goto FOUND_CTL; +#elif defined(CINATRA_ARM_OPT) + const size_t block_size = 2 * sizeof(uint8x16_t) - 1; + const char *const end = (size_t)(buf_end - buf) >= block_size ? buf_end - block_size : buf; + + for (; buf < end; buf += 2 * sizeof(uint8x16_t)) { + const uint8x16_t space = vmovq_n_u8('\040'); + const uint8x16_t threshold = vmovq_n_u8(0137u); + const uint8x16_t v1 = vld1q_u8((const uint8_t *)buf); + const uint8x16_t v2 = vld1q_u8((const uint8_t *)buf + sizeof(v1)); + uint8x16_t v3 = vsubq_u8(v1, space); + uint8x16_t v4 = vsubq_u8(v2, space); + v3 = vcgeq_u8(v3, threshold); + v4 = vcgeq_u8(v4, threshold); + v3 = vorrq_u8(v3, v4); + /* Pack the comparison result into half a vector, i.e. 64 bits. */ + v3 = vpmaxq_u8(v3, v3); + + if (vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0)) { + const uint8x16_t del = vmovq_n_u8('\177'); + /* This mask makes it possible to pack the comparison results into half a vector, + * which has the same size as uint64_t. */ + const uint8x16_t mask = vreinterpretq_u8_u32(vmovq_n_u32(0x40100401)); + const uint8x16_t tab = vmovq_n_u8('\011'); + + v3 = vcltq_u8(v1, space); + v4 = vcltq_u8(v2, space); + v3 = vbicq_u8(v3, vceqq_u8(v1, tab)); + v4 = vbicq_u8(v4, vceqq_u8(v2, tab)); + v3 = vorrq_u8(v3, vceqq_u8(v1, del)); + v4 = vorrq_u8(v4, vceqq_u8(v2, del)); + /* After masking, four consecutive bytes in the results do not have the same bits set. */ + v3 = vandq_u8(v3, mask); + v4 = vandq_u8(v4, mask); + /* Pack the comparison results into 128, and then 64 bits. */ + v3 = vpaddq_u8(v3, v4); + v3 = vpaddq_u8(v3, v3); + + uint64_t offset = vgetq_lane_u64(vreinterpretq_u64_u8(v3), 0); + + if (offset) { + __asm__("rbit %x0, %x0" : "+r"(offset)); + static_assert(sizeof(unsigned long long) == sizeof(uint64_t), "Need the number of leading 0-bits in uint64_t."); + /* offset uses 2 bits per byte of input. */ + buf += __builtin_clzll(offset) / 2; + goto FOUND_CTL; + } + } + } +#else /* find non-printable char within the next 8 bytes, this is the hottest code; * manually inlined */ while (likely(buf_end - buf >= 8)) { @@ -198,7 +361,7 @@ static const char *get_token_to_eol(const char *buf, const char *buf_end, } ++buf; } - +#endif for (;; ++buf) { CHECK_EOF(); if (unlikely(!IS_PRINTABLE_ASCII(*buf))) { @@ -295,6 +458,318 @@ static const char *parse_http_version(const char *buf, const char *buf_end, return buf; } +#ifdef CINATRA_AVX2 +static unsigned long TZCNT(unsigned long long in) { + unsigned long res; + asm("tzcnt %1, %0\n\t" : "=r"(res) : "r"(in)); + return res; +} +/* Parse only 32 bytes */ +static void find_ranges32(__m256i b0, unsigned long *range0, unsigned long *range1) { + const __m256i rr0 = _mm256_set1_epi8(0x00 - 1); + const __m256i rr1 = _mm256_set1_epi8(0x1f + 1); + const __m256i rr2 = _mm256_set1_epi8(0x3a); + const __m256i rr4 = _mm256_set1_epi8(0x7f); + const __m256i rr7 = _mm256_set1_epi8(0x09); + + /* 0<=x */ + __m256i gz0 = _mm256_cmpgt_epi8(b0, rr0); + /* 0== 96) { + b0 = _mm256_loadu_si256((void*) buf + 32*0); + b1 = _mm256_loadu_si256((void*) buf + 32*1); + b2 = _mm256_loadu_si256((void*) buf + 32*2); + b3 = _mm256_loadu_si256((void*) tmpbuf); + } else if (dist >= 64) { + b0 = _mm256_loadu_si256((void*) buf + 32*0); + b1 = _mm256_loadu_si256((void*) buf + 32*1); + b2 = _mm256_loadu_si256((void*) tmpbuf); + b3 = _mm256_setzero_si256(); + } else { + if(dist < 32) { + b0 = _mm256_loadu_si256((void*)tmpbuf); + return find_ranges32(b0, range0, range1); + } else { + b0 = _mm256_loadu_si256((void*) buf + 32*0); + b1 = _mm256_loadu_si256((void*)tmpbuf); + return find_ranges64(b0, b1, range0, range1); + } + } + } else { + /* Load 128 bytes */ + b0 = _mm256_loadu_si256((void*) buf + 32*0); + b1 = _mm256_loadu_si256((void*) buf + 32*1); + b2 = _mm256_loadu_si256((void*) buf + 32*2); + b3 = _mm256_loadu_si256((void*) buf + 32*3); + } + + /* 0<=x */ + __m256i gz0 = _mm256_cmpgt_epi8(b0, rr0); + __m256i gz1 = _mm256_cmpgt_epi8(b1, rr0); + __m256i gz2 = _mm256_cmpgt_epi8(b2, rr0); + __m256i gz3 = _mm256_cmpgt_epi8(b3, rr0); + /* 0== 65 && * buf <= 90)) { + if (! token_char_map[(unsigned char)*buf]) { + *ret = -1; + *num_headers = n_headers; + return NULL; + } + name = buf; + + /* Attempt to find a match in the index */ + found = 0; + do { + unsigned long distance = buf - prep_start; + /* Check if the bitmaps are still valid. An assumption I make is that + buf > 128 (i.e. the os will never allocate memory at address 0-128 */ + if(unlikely(distance >= 128)) { /* Bitmaps are too old, make new ones */ + prep_start = buf; + distance = 0; + find_ranges(buf, buf_end, rr0, rr1); + } else if(distance >= 64) { /* In the second half of the bitmap */ + unsigned long index = rr0[1] >> (distance - 64); /* Correct offset of the bitmap */ + unsigned long find = TZCNT(index); /* Fine next set bit */ + if((find < 64)) { /* Yey, we found a token */ + buf += find; + found = 1; + break; + } + buf = prep_start + 128; /* No token was found in the current bitmap */ + continue; + } + unsigned long index = rr0[0] >> (distance); /* In the first half of the bitmap */ + unsigned long find = TZCNT(index); /* Find next set bit */ + if((find < 64)){ /* Token found */ + buf += find; + found = 1; + break; + } /* Token not found, look at second half of bitmap */ + index = rr0[1]; + find = TZCNT(index); + if((find < 64)){ + buf += 64+find - distance; + found = 1; + break; + } + + buf = prep_start + 128; + } while (buf < buf_end); + + if(!found) + if(buf >= buf_end) { + *ret = -2; + *num_headers = n_headers; + return NULL; + } + name_len = buf - name; + ++buf; + CHECK_EOF(); + while( (*buf == ' ' || *buf == '\t') ) { + ++buf; + CHECK_EOF(); + } + } else { + name = NULL; + name_len = 0; + } + const char* token_start = buf; + + found = 0; + + do { + /* Too far */ + unsigned long distance = buf - prep_start; /* Same algorithm as above */ + if(unlikely(distance >= 128)) { + prep_start = buf; + distance = 0; + find_ranges(buf, buf_end, rr0, rr1); + } else if(distance >= 64) { + unsigned long index = rr1[1] >> (distance - 64); + unsigned long find = TZCNT(index); + if((find < 64)) { + buf += find; + found = 1; + break; + } + buf = prep_start + 128; + continue; + } + unsigned long index = rr1[0] >> (distance); + unsigned long find = TZCNT(index); + if((find < 64)){ + buf += find; + found = 1; + break; + } + index = rr1[1]; + find = TZCNT(index); + if((find < 64)){ + buf += 64+find - distance; + found = 1; + break; + } + + buf = prep_start + 128; + } while (buf < buf_end); + + if(!found) + if(buf >= buf_end) { + *ret = -2; + *num_headers = n_headers; + return NULL; + } + + unsigned short two_char = *(unsigned short*)buf; + + if( likely(two_char == 0x0a0d) ) { + value_len = buf - token_start; + buf += 2; + } else if (unlikely(two_char & 0x0a == 0x0a)) { + value_len = buf - token_start; + ++buf; + } else { + *ret = -1; + *num_headers = n_headers; + return NULL; + } + value = token_start; + headers[*num_headers] = {std::string_view{name, name_len}, + std::string_view{value, value_len}}; + } + *num_headers = n_headers; + return buf; +} + +#else + static const char *parse_headers(const char *buf, const char *buf_end, http_header *headers, size_t *num_headers, size_t max_headers, int *ret) { @@ -372,6 +847,8 @@ static const char *parse_headers(const char *buf, const char *buf_end, return buf; } +#endif + static const char *parse_request(const char *buf, const char *buf_end, const char **method, size_t *method_len, const char **path, size_t *path_len, diff --git a/lang/english/README.md b/lang/english/README.md index 744f830a..bf51a5bb 100644 --- a/lang/english/README.md +++ b/lang/english/README.md @@ -53,6 +53,7 @@ Use the following command to compile cinatra with simd optimization.Note that on ```shell cmake -DENABLE_SIMD=SSE42 .. # enable sse4.2 instruction set cmake -DENABLE_SIMD=AVX2 .. # enable avx2 instruction set +cmake -DENABLE_SIMD=AARCH64 .. # enable neon instruction set in aarch64 ``` ## Examples diff --git a/press_tool/CMakeLists.txt b/press_tool/CMakeLists.txt index 211a413e..a96e319e 100644 --- a/press_tool/CMakeLists.txt +++ b/press_tool/CMakeLists.txt @@ -25,7 +25,9 @@ endif() if (ENABLE_SIMD STREQUAL "AARCH64") if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") - #TODO + add_library(neon INTERFACE IMPORTED) + target_compile_options(neon INTERFACE -march=armv8-a+fp+simd) + target_link_libraries(${project_name} neon) endif () elseif (ENABLE_SIMD STREQUAL "SSE42") if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index ee04bf8f..87e3eaf2 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -59,4 +59,39 @@ if (CINATRA_ENABLE_SSL) add_definitions(-DCINATRA_ENABLE_SSL) target_link_libraries(test_cinatra OpenSSL::SSL OpenSSL::Crypto) target_link_libraries(test_corofile PRIVATE OpenSSL::SSL OpenSSL::Crypto) -endif () \ No newline at end of file +endif () + +add_executable(test_http_parse + test_http_parse.cpp + ) + +if (ENABLE_SIMD STREQUAL "AARCH64") + if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "aarch64") + add_library(neon INTERFACE IMPORTED) + target_compile_options(neon INTERFACE -march=armv8-a+fp+simd) + target_link_libraries(test_http_parse neon) + endif () +elseif (ENABLE_SIMD STREQUAL "SSE42") + if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") + add_library(sse4_2 INTERFACE IMPORTED) + if(MSVC) + target_compile_options(sse4_2 INTERFACE /arch:SSE4.2) + else() + target_compile_options(sse4_2 INTERFACE -msse4.2) + endif() + target_link_libraries(test_http_parse sse4_2) + endif () +elseif (ENABLE_SIMD STREQUAL "AVX2") + if (CMAKE_HOST_SYSTEM_PROCESSOR MATCHES "x86_64") + add_library(avx2 INTERFACE IMPORTED) + if(MSVC) + target_compile_options(avx2 INTERFACE /arch:AVX2) + else() + target_compile_options(avx2 INTERFACE -mavx2) + endif() + target_link_libraries(test_http_parse avx2) + set(CMAKE_CXX_FLAGS "-fpermissive") + endif () +endif () + +add_test(NAME test_http_parse COMMAND test_http_parse) \ No newline at end of file diff --git a/tests/test_http_parse.cpp b/tests/test_http_parse.cpp new file mode 100644 index 00000000..36fa6597 --- /dev/null +++ b/tests/test_http_parse.cpp @@ -0,0 +1,55 @@ +#define DOCTEST_CONFIG_IMPLEMENT + +#include + +#include "cinatra/picohttpparser.h" +#include "doctest/doctest.h" + +using namespace cinatra; + +#define REQ \ + "GET /wp-content/uploads/2010/03/hello-kitty-darth-vader-pink.jpg " \ + "HTTP/1.1\r\n" \ + "Host: www.kittyhell.com\r\n" \ + "User-Agent: Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10.6; ja-JP-mac; " \ + "rv:1.9.2.3) Gecko/20100401 Firefox/3.6.3 " \ + "Pathtraq/0.9\r\n" \ + "Accept: " \ + "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8\r\n" \ + "Accept-Language: ja,en-us;q=0.7,en;q=0.3\r\n" \ + "Accept-Encoding: gzip,deflate\r\n" \ + "Accept-Charset: Shift_JIS,utf-8;q=0.7,*;q=0.7\r\n" \ + "Keep-Alive: 115\r\n" \ + "Connection: keep-alive\r\n" \ + "Cookie: wp_ozh_wsa_visits=2; wp_ozh_wsa_visit_lasttime=xxxxxxxxxx; " \ + "__utma=xxxxxxxxx.xxxxxxxxxx.xxxxxxxxxx.xxxxxxxxxx.xxxxxxxxxx.x; " \ + "__utmz=xxxxxxxxx.xxxxxxxxxx.x.x.utmccn=(referral)|utmcsr=reader.livedoor." \ + "com|utmcct=/reader/|utmcmd=referral\r\n" \ + "\r\n" + +TEST_CASE("http parser test") { + const char *method; + size_t method_len; + const char *path; + size_t path_len; + int minor_version; + cinatra::http_header headers[64]; + size_t num_headers; + int i, ret; + + num_headers = sizeof(headers) / sizeof(headers[0]); + ret = cinatra::detail::phr_parse_request( + REQ, sizeof(REQ) - 1, &method, &method_len, &path, &path_len, + &minor_version, headers, &num_headers, 0); + CHECK(ret == 703); + CHECK(strncmp(method, "GET", method_len) == 0); + CHECK(minor_version == 1); + std::string name(headers[0].name); + std::string value(headers[0].value); + CHECK(name == "Host"); + CHECK(value == "www.kittyhell.com"); +} + +DOCTEST_MSVC_SUPPRESS_WARNING_WITH_PUSH(4007) +int main(int argc, char **argv) { return doctest::Context(argc, argv).run(); } +DOCTEST_MSVC_SUPPRESS_WARNING_POP \ No newline at end of file