提交 d09cc0f3 编写于 作者: A Alexander Smorkalov

Merge pull request #22344 from ocpalo:libjpegturbo2.1.3

......@@ -4,9 +4,9 @@ ocv_warnings_disable(CMAKE_C_FLAGS -Wunused-parameter -Wsign-compare -Wshorten-6
set(VERSION_MAJOR 2)
set(VERSION_MINOR 1)
set(VERSION_REVISION 2)
set(VERSION_REVISION 3)
set(VERSION ${VERSION_MAJOR}.${VERSION_MINOR}.${VERSION_REVISION})
set(LIBJPEG_TURBO_VERSION_NUMBER 2001002)
set(LIBJPEG_TURBO_VERSION_NUMBER 2001003)
string(TIMESTAMP BUILD "opencv-${OPENCV_VERSION}-libjpeg-turbo")
if(CMAKE_BUILD_TYPE STREQUAL "Debug")
......@@ -79,14 +79,13 @@ configure_file(jconfigint.h.in jconfigint.h)
include_directories(${CMAKE_CURRENT_BINARY_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/src)
set(JPEG_SOURCES
jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c jcicc.c
jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c jcphuff.c
jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c jdatasrc.c
jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdicc.c jdinput.c jdmainct.c jdmarker.c
jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c jdtrans.c jerror.c
jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c jidctint.c jidctred.c
jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c)
set(JPEG_SOURCES jcapimin.c jcapistd.c jccoefct.c jccolor.c jcdctmgr.c jchuff.c
jcicc.c jcinit.c jcmainct.c jcmarker.c jcmaster.c jcomapi.c jcparam.c
jcphuff.c jcprepct.c jcsample.c jctrans.c jdapimin.c jdapistd.c jdatadst.c
jdatasrc.c jdcoefct.c jdcolor.c jddctmgr.c jdhuff.c jdicc.c jdinput.c
jdmainct.c jdmarker.c jdmaster.c jdmerge.c jdphuff.c jdpostct.c jdsample.c
jdtrans.c jerror.c jfdctflt.c jfdctfst.c jfdctint.c jidctflt.c jidctfst.c
jidctint.c jidctred.c jquant1.c jquant2.c jutils.c jmemmgr.c jmemnobs.c)
if(WITH_ARITH_ENC OR WITH_ARITH_DEC)
set(JPEG_SOURCES ${JPEG_SOURCES} jaricom.c)
......
......@@ -91,7 +91,7 @@ best of our understanding.
The Modified (3-clause) BSD License
===================================
Copyright (C)2009-2021 D. R. Commander. All Rights Reserved.<br>
Copyright (C)2009-2022 D. R. Commander. All Rights Reserved.<br>
Copyright (C)2015 Viktor Szathmáry. All Rights Reserved.
Redistribution and use in source and binary forms, with or without
......
......@@ -4,8 +4,8 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1998, Thomas G. Lane.
* Modified 2003-2010 by Guido Vollbeding.
* It was modified by The libjpeg-turbo Project to include only code relevant
* to libjpeg-turbo.
* libjpeg-turbo Modifications:
* Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -52,7 +52,7 @@ jpeg_CreateCompress(j_compress_ptr cinfo, int version, size_t structsize)
{
struct jpeg_error_mgr *err = cinfo->err;
void *client_data = cinfo->client_data; /* ignore Purify complaint here */
MEMZERO(cinfo, sizeof(struct jpeg_compress_struct));
memset(cinfo, 0, sizeof(struct jpeg_compress_struct));
cinfo->err = err;
cinfo->client_data = client_data;
}
......
......@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Developed 1997-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2015, 2018, D. R. Commander.
* Copyright (C) 2015, 2018, 2021-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -338,14 +338,14 @@ emit_restart(j_compress_ptr cinfo, int restart_num)
compptr = cinfo->cur_comp_info[ci];
/* DC needs no table for refinement scan */
if (cinfo->progressive_mode == 0 || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
memset(entropy->dc_stats[compptr->dc_tbl_no], 0, DC_STAT_BINS);
/* Reset DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
}
/* AC needs no table when not present */
if (cinfo->progressive_mode == 0 || cinfo->Se) {
MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
memset(entropy->ac_stats[compptr->ac_tbl_no], 0, AC_STAT_BINS);
}
}
......@@ -836,7 +836,7 @@ start_pass(j_compress_ptr cinfo, boolean gather_statistics)
* We are fully adaptive here and need no extra
* statistics gathering pass!
*/
ERREXIT(cinfo, JERR_NOT_COMPILED);
ERREXIT(cinfo, JERR_NOTIMPL);
/* We assume jcmaster.c already validated the progressive scan parameters. */
......@@ -867,7 +867,7 @@ start_pass(j_compress_ptr cinfo, boolean gather_statistics)
if (entropy->dc_stats[tbl] == NULL)
entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
memset(entropy->dc_stats[tbl], 0, DC_STAT_BINS);
/* Initialize DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
......@@ -880,7 +880,7 @@ start_pass(j_compress_ptr cinfo, boolean gather_statistics)
if (entropy->ac_stats[tbl] == NULL)
entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
memset(entropy->ac_stats[tbl], 0, AC_STAT_BINS);
#ifdef CALCULATE_SPECTRAL_CONDITIONING
if (cinfo->progressive_mode)
/* Section G.1.3.2: Set appropriate arithmetic conditioning value Kx */
......
......@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2009-2011, 2014-2016, 2018-2021, D. R. Commander.
* Copyright (C) 2009-2011, 2014-2016, 2018-2022, D. R. Commander.
* Copyright (C) 2015, Matthieu Darbois.
* Copyright (C) 2018, Matthias Räncker.
* Copyright (C) 2020, Arm Limited.
......@@ -200,12 +200,12 @@ start_pass_huff(j_compress_ptr cinfo, boolean gather_statistics)
entropy->dc_count_ptrs[dctbl] = (long *)
(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
257 * sizeof(long));
MEMZERO(entropy->dc_count_ptrs[dctbl], 257 * sizeof(long));
memset(entropy->dc_count_ptrs[dctbl], 0, 257 * sizeof(long));
if (entropy->ac_count_ptrs[actbl] == NULL)
entropy->ac_count_ptrs[actbl] = (long *)
(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
257 * sizeof(long));
MEMZERO(entropy->ac_count_ptrs[actbl], 257 * sizeof(long));
memset(entropy->ac_count_ptrs[actbl], 0, 257 * sizeof(long));
#endif
} else {
/* Compute derived values for Huffman tables */
......@@ -315,8 +315,8 @@ jpeg_make_c_derived_tbl(j_compress_ptr cinfo, boolean isDC, int tblno,
* this lets us detect duplicate VAL entries here, and later
* allows emit_bits to detect any attempt to emit such symbols.
*/
MEMZERO(dtbl->ehufco, sizeof(dtbl->ehufco));
MEMZERO(dtbl->ehufsi, sizeof(dtbl->ehufsi));
memset(dtbl->ehufco, 0, sizeof(dtbl->ehufco));
memset(dtbl->ehufsi, 0, sizeof(dtbl->ehufsi));
/* This is also a convenient place to check for out-of-range
* and duplicated VAL entries. We allow 0..255 for AC symbols
......@@ -478,7 +478,7 @@ dump_buffer(working_state *state)
buffer = _buffer; \
while (bytes > 0) { \
bytestocopy = MIN(bytes, state->free_in_buffer); \
MEMCOPY(state->next_output_byte, buffer, bytestocopy); \
memcpy(state->next_output_byte, buffer, bytestocopy); \
state->next_output_byte += bytestocopy; \
buffer += bytestocopy; \
state->free_in_buffer -= bytestocopy; \
......@@ -941,8 +941,8 @@ jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
/* This algorithm is explained in section K.2 of the JPEG standard */
MEMZERO(bits, sizeof(bits));
MEMZERO(codesize, sizeof(codesize));
memset(bits, 0, sizeof(bits));
memset(codesize, 0, sizeof(codesize));
for (i = 0; i < 257; i++)
others[i] = -1; /* init links to empty */
......@@ -1044,7 +1044,7 @@ jpeg_gen_optimal_table(j_compress_ptr cinfo, JHUFF_TBL *htbl, long freq[])
bits[i]--;
/* Return final symbol counts (only for lengths 0..16) */
MEMCOPY(htbl->bits, bits, sizeof(htbl->bits));
memcpy(htbl->bits, bits, sizeof(htbl->bits));
/* Return a list of the symbols sorted by code length */
/* It's not real clear to me why we don't need to consider the codelength
......@@ -1083,8 +1083,8 @@ finish_pass_gather(j_compress_ptr cinfo)
/* It's important not to apply jpeg_gen_optimal_table more than once
* per table, because it clobbers the input frequency counts!
*/
MEMZERO(did_dc, sizeof(did_dc));
MEMZERO(did_ac, sizeof(did_ac));
memset(did_dc, 0, sizeof(did_dc));
memset(did_ac, 0, sizeof(did_ac));
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
......
......@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1995-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2011, 2015, 2018, 2021, D. R. Commander.
* Copyright (C) 2011, 2015, 2018, 2021-2022, D. R. Commander.
* Copyright (C) 2016, 2018, Matthieu Darbois.
* Copyright (C) 2020, Arm Limited.
* Copyright (C) 2021, Alex Richardson.
......@@ -275,7 +275,7 @@ start_pass_phuff(j_compress_ptr cinfo, boolean gather_statistics)
entropy->count_ptrs[tbl] = (long *)
(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
257 * sizeof(long));
MEMZERO(entropy->count_ptrs[tbl], 257 * sizeof(long));
memset(entropy->count_ptrs[tbl], 0, 257 * sizeof(long));
} else {
/* Compute derived values for Huffman table */
/* We may do this more than once for a table, but it's not expensive */
......@@ -584,8 +584,8 @@ encode_mcu_DC_first(j_compress_ptr cinfo, JBLOCKROW *MCU_data)
continue; \
/* For a negative coef, want temp2 = bitwise complement of abs(coef) */ \
temp2 ^= temp; \
values[k] = temp; \
values[k + DCTSIZE2] = temp2; \
values[k] = (JCOEF)temp; \
values[k + DCTSIZE2] = (JCOEF)temp2; \
zerobits |= ((size_t)1U) << k; \
} \
}
......@@ -1062,7 +1062,7 @@ finish_pass_gather_phuff(j_compress_ptr cinfo)
/* It's important not to apply jpeg_gen_optimal_table more than once
* per table, because it clobbers the input frequency counts!
*/
MEMZERO(did, sizeof(did));
memset(did, 0, sizeof(did));
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
......
......@@ -3,8 +3,8 @@
*
* This file is part of the Independent JPEG Group's software:
* Copyright (C) 1994-1996, Thomas G. Lane.
* It was modified by The libjpeg-turbo Project to include only code relevant
* to libjpeg-turbo.
* libjpeg-turbo Modifications:
* Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -289,8 +289,8 @@ create_context_buffer(j_compress_ptr cinfo)
cinfo->max_h_samp_factor) / compptr->h_samp_factor),
(JDIMENSION)(3 * rgroup_height));
/* Copy true buffer row pointers into the middle of the fake row array */
MEMCOPY(fake_buffer + rgroup_height, true_buffer,
3 * rgroup_height * sizeof(JSAMPROW));
memcpy(fake_buffer + rgroup_height, true_buffer,
3 * rgroup_height * sizeof(JSAMPROW));
/* Fill in the above and below wraparound pointers */
for (i = 0; i < rgroup_height; i++) {
fake_buffer[i] = true_buffer[2 * rgroup_height + i];
......
......@@ -5,7 +5,7 @@
* Copyright (C) 1995-1998, Thomas G. Lane.
* Modified 2000-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2020, D. R. Commander.
* Copyright (C) 2020, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -100,8 +100,8 @@ jpeg_copy_critical_parameters(j_decompress_ptr srcinfo, j_compress_ptr dstinfo)
qtblptr = &dstinfo->quant_tbl_ptrs[tblno];
if (*qtblptr == NULL)
*qtblptr = jpeg_alloc_quant_table((j_common_ptr)dstinfo);
MEMCOPY((*qtblptr)->quantval, srcinfo->quant_tbl_ptrs[tblno]->quantval,
sizeof((*qtblptr)->quantval));
memcpy((*qtblptr)->quantval, srcinfo->quant_tbl_ptrs[tblno]->quantval,
sizeof((*qtblptr)->quantval));
(*qtblptr)->sent_table = FALSE;
}
}
......
......@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1998, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2016, D. R. Commander.
* Copyright (C) 2016, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -53,7 +53,7 @@ jpeg_CreateDecompress(j_decompress_ptr cinfo, int version, size_t structsize)
{
struct jpeg_error_mgr *err = cinfo->err;
void *client_data = cinfo->client_data; /* ignore Purify complaint here */
MEMZERO(cinfo, sizeof(struct jpeg_decompress_struct));
memset(cinfo, 0, sizeof(struct jpeg_decompress_struct));
cinfo->err = err;
cinfo->client_data = client_data;
}
......@@ -92,7 +92,7 @@ jpeg_CreateDecompress(j_decompress_ptr cinfo, int version, size_t structsize)
cinfo->master = (struct jpeg_decomp_master *)
(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_PERMANENT,
sizeof(my_decomp_master));
MEMZERO(cinfo->master, sizeof(my_decomp_master));
memset(cinfo->master, 0, sizeof(my_decomp_master));
}
......
......@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1994-1996, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2010, 2015-2020, D. R. Commander.
* Copyright (C) 2010, 2015-2020, 2022, D. R. Commander.
* Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
......@@ -159,6 +159,7 @@ jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
JDIMENSION input_xoffset;
boolean reinit_upsampler = FALSE;
jpeg_component_info *compptr;
my_master_ptr master = (my_master_ptr)cinfo->master;
if (cinfo->global_state != DSTATE_SCANNING || cinfo->output_scanline != 0)
ERREXIT1(cinfo, JERR_BAD_STATE, cinfo->global_state);
......@@ -208,6 +209,11 @@ jpeg_crop_scanline(j_decompress_ptr cinfo, JDIMENSION *xoffset,
*/
*width = *width + input_xoffset - *xoffset;
cinfo->output_width = *width;
if (master->using_merged_upsample && cinfo->max_v_samp_factor == 2) {
my_merged_upsample_ptr upsample = (my_merged_upsample_ptr)cinfo->upsample;
upsample->out_row_width =
cinfo->output_width * cinfo->out_color_components;
}
/* Set the first and last iMCU columns that we must decompress. These values
* will be used in single-scan decompressions.
......
......@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Developed 1997-2015 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2015-2020, D. R. Commander.
* Copyright (C) 2015-2020, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -210,13 +210,13 @@ process_restart(j_decompress_ptr cinfo)
for (ci = 0; ci < cinfo->comps_in_scan; ci++) {
compptr = cinfo->cur_comp_info[ci];
if (!cinfo->progressive_mode || (cinfo->Ss == 0 && cinfo->Ah == 0)) {
MEMZERO(entropy->dc_stats[compptr->dc_tbl_no], DC_STAT_BINS);
memset(entropy->dc_stats[compptr->dc_tbl_no], 0, DC_STAT_BINS);
/* Reset DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
}
if (!cinfo->progressive_mode || cinfo->Ss) {
MEMZERO(entropy->ac_stats[compptr->ac_tbl_no], AC_STAT_BINS);
memset(entropy->ac_stats[compptr->ac_tbl_no], 0, AC_STAT_BINS);
}
}
......@@ -471,17 +471,17 @@ decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (*thiscoef) { /* previously nonzero coef */
if (arith_decode(cinfo, st + 2)) {
if (*thiscoef < 0)
*thiscoef += m1;
*thiscoef += (JCOEF)m1;
else
*thiscoef += p1;
*thiscoef += (JCOEF)p1;
}
break;
}
if (arith_decode(cinfo, st + 1)) { /* newly nonzero coef */
if (arith_decode(cinfo, entropy->fixed_bin))
*thiscoef = m1;
*thiscoef = (JCOEF)m1;
else
*thiscoef = p1;
*thiscoef = (JCOEF)p1;
break;
}
st += 3; k++;
......@@ -698,8 +698,8 @@ bad:
/* Check that the scan parameters Ss, Se, Ah/Al are OK for sequential JPEG.
* This ought to be an error condition, but we make it a warning.
*/
if (cinfo->Ss != 0 || cinfo->Ah != 0 || cinfo->Al != 0 ||
(cinfo->Se < DCTSIZE2 && cinfo->Se != DCTSIZE2 - 1))
if (cinfo->Ss != 0 || cinfo->Se != DCTSIZE2 - 1 ||
cinfo->Ah != 0 || cinfo->Al != 0)
WARNMS(cinfo, JWRN_NOT_SEQUENTIAL);
/* Select MCU decoding routine */
entropy->pub.decode_mcu = decode_mcu;
......@@ -715,7 +715,7 @@ bad:
if (entropy->dc_stats[tbl] == NULL)
entropy->dc_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
((j_common_ptr)cinfo, JPOOL_IMAGE, DC_STAT_BINS);
MEMZERO(entropy->dc_stats[tbl], DC_STAT_BINS);
memset(entropy->dc_stats[tbl], 0, DC_STAT_BINS);
/* Initialize DC predictions to 0 */
entropy->last_dc_val[ci] = 0;
entropy->dc_context[ci] = 0;
......@@ -727,7 +727,7 @@ bad:
if (entropy->ac_stats[tbl] == NULL)
entropy->ac_stats[tbl] = (unsigned char *)(*cinfo->mem->alloc_small)
((j_common_ptr)cinfo, JPOOL_IMAGE, AC_STAT_BINS);
MEMZERO(entropy->ac_stats[tbl], AC_STAT_BINS);
memset(entropy->ac_stats[tbl], 0, AC_STAT_BINS);
}
}
......
......@@ -5,7 +5,7 @@
* Copyright (C) 1994-1996, Thomas G. Lane.
* Modified 2009-2012 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2013, 2016, D. R. Commander.
* Copyright (C) 2013, 2016, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -23,11 +23,6 @@
#include "jpeglib.h"
#include "jerror.h"
#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare malloc(),free() */
extern void *malloc(size_t size);
extern void free(void *ptr);
#endif
/* Expanded data destination object for stdio output */
......@@ -116,7 +111,7 @@ empty_output_buffer(j_compress_ptr cinfo)
{
my_dest_ptr dest = (my_dest_ptr)cinfo->dest;
if (JFWRITE(dest->outfile, dest->buffer, OUTPUT_BUF_SIZE) !=
if (fwrite(dest->buffer, 1, OUTPUT_BUF_SIZE, dest->outfile) !=
(size_t)OUTPUT_BUF_SIZE)
ERREXIT(cinfo, JERR_FILE_WRITE);
......@@ -141,7 +136,7 @@ empty_mem_output_buffer(j_compress_ptr cinfo)
if (nextbuffer == NULL)
ERREXIT1(cinfo, JERR_OUT_OF_MEMORY, 10);
MEMCOPY(nextbuffer, dest->buffer, dest->bufsize);
memcpy(nextbuffer, dest->buffer, dest->bufsize);
free(dest->newbuffer);
......@@ -175,7 +170,7 @@ term_destination(j_compress_ptr cinfo)
/* Write any data remaining in the buffer */
if (datacount > 0) {
if (JFWRITE(dest->outfile, dest->buffer, datacount) != datacount)
if (fwrite(dest->buffer, 1, datacount, dest->outfile) != datacount)
ERREXIT(cinfo, JERR_FILE_WRITE);
}
fflush(dest->outfile);
......
......@@ -5,7 +5,7 @@
* Copyright (C) 1994-1996, Thomas G. Lane.
* Modified 2009-2011 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2013, 2016, D. R. Commander.
* Copyright (C) 2013, 2016, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -104,7 +104,7 @@ fill_input_buffer(j_decompress_ptr cinfo)
my_src_ptr src = (my_src_ptr)cinfo->src;
size_t nbytes;
nbytes = JFREAD(src->infile, src->buffer, INPUT_BUF_SIZE);
nbytes = fread(src->buffer, 1, INPUT_BUF_SIZE, src->infile);
if (nbytes <= 0) {
if (src->start_of_file) /* Treat empty input file as fatal error */
......
......@@ -6,7 +6,7 @@
* Modified 2002-2010 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright 2009 Pierre Ossman <ossman@cendio.se> for Cendio AB
* Copyright (C) 2010, 2015, D. R. Commander.
* Copyright (C) 2010, 2015, 2022, D. R. Commander.
* Copyright (C) 2013, MIPS Technologies, Inc., California.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
......@@ -345,7 +345,7 @@ jinit_inverse_dct(j_decompress_ptr cinfo)
compptr->dct_table =
(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(multiplier_table));
MEMZERO(compptr->dct_table, sizeof(multiplier_table));
memset(compptr->dct_table, 0, sizeof(multiplier_table));
/* Mark multiplier table not yet set up for any method */
idct->cur_method[ci] = -1;
}
......
......@@ -18,10 +18,6 @@
#include "jpeglib.h"
#include "jerror.h"
#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare malloc() */
extern void *malloc(size_t size);
#endif
#define ICC_MARKER (JPEG_APP0 + 2) /* JPEG marker code for ICC */
#define ICC_OVERHEAD_LEN 14 /* size of non-profile data in APP2 */
......
......@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2010, 2016, 2018, D. R. Commander.
* Copyright (C) 2010, 2016, 2018, 2022, D. R. Commander.
* Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
......@@ -264,7 +264,7 @@ latch_quant_tables(j_decompress_ptr cinfo)
qtbl = (JQUANT_TBL *)
(*cinfo->mem->alloc_small) ((j_common_ptr)cinfo, JPOOL_IMAGE,
sizeof(JQUANT_TBL));
MEMCOPY(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL));
memcpy(qtbl, cinfo->quant_tbl_ptrs[qtblno], sizeof(JQUANT_TBL));
compptr->quant_table = qtbl;
}
}
......
......@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1998, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2012, 2015, D. R. Commander.
* Copyright (C) 2012, 2015, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -473,7 +473,7 @@ get_dht(j_decompress_ptr cinfo)
for (i = 0; i < count; i++)
INPUT_BYTE(cinfo, huffval[i], return FALSE);
MEMZERO(&huffval[count], (256 - count) * sizeof(UINT8));
memset(&huffval[count], 0, (256 - count) * sizeof(UINT8));
length -= count;
......@@ -491,8 +491,8 @@ get_dht(j_decompress_ptr cinfo)
if (*htblptr == NULL)
*htblptr = jpeg_alloc_huff_table((j_common_ptr)cinfo);
MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
MEMCOPY((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval));
memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
memcpy((*htblptr)->huffval, huffval, sizeof((*htblptr)->huffval));
}
if (length != 0)
......
......@@ -5,7 +5,7 @@
* Copyright (C) 1991-1997, Thomas G. Lane.
* Modified 2002-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2009-2011, 2016, 2019, D. R. Commander.
* Copyright (C) 2009-2011, 2016, 2019, 2022, D. R. Commander.
* Copyright (C) 2013, Linaro Limited.
* Copyright (C) 2015, Google, Inc.
* For conditions of distribution and use, see the accompanying README.ijg
......@@ -417,7 +417,7 @@ prepare_range_limit_table(j_decompress_ptr cinfo)
table += (MAXJSAMPLE + 1); /* allow negative subscripts of simple table */
cinfo->sample_range_limit = table;
/* First segment of "simple" table: limit[x] = 0 for x < 0 */
MEMZERO(table - (MAXJSAMPLE + 1), (MAXJSAMPLE + 1) * sizeof(JSAMPLE));
memset(table - (MAXJSAMPLE + 1), 0, (MAXJSAMPLE + 1) * sizeof(JSAMPLE));
/* Main part of "simple" table: limit[x] = x */
for (i = 0; i <= MAXJSAMPLE; i++)
table[i] = (JSAMPLE)i;
......@@ -426,10 +426,10 @@ prepare_range_limit_table(j_decompress_ptr cinfo)
for (i = CENTERJSAMPLE; i < 2 * (MAXJSAMPLE + 1); i++)
table[i] = MAXJSAMPLE;
/* Second half of post-IDCT table */
MEMZERO(table + (2 * (MAXJSAMPLE + 1)),
(2 * (MAXJSAMPLE + 1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
MEMCOPY(table + (4 * (MAXJSAMPLE + 1) - CENTERJSAMPLE),
cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
memset(table + (2 * (MAXJSAMPLE + 1)), 0,
(2 * (MAXJSAMPLE + 1) - CENTERJSAMPLE) * sizeof(JSAMPLE));
memcpy(table + (4 * (MAXJSAMPLE + 1) - CENTERJSAMPLE),
cinfo->sample_range_limit, CENTERJSAMPLE * sizeof(JSAMPLE));
}
......
......@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1995-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2015-2016, 2018-2021, D. R. Commander.
* Copyright (C) 2015-2016, 2018-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -578,9 +578,9 @@ decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (GET_BITS(1)) {
if ((*thiscoef & p1) == 0) { /* do nothing if already set it */
if (*thiscoef >= 0)
*thiscoef += p1;
*thiscoef += (JCOEF)p1;
else
*thiscoef += m1;
*thiscoef += (JCOEF)m1;
}
}
} else {
......@@ -612,9 +612,9 @@ decode_mcu_AC_refine(j_decompress_ptr cinfo, JBLOCKROW *MCU_data)
if (GET_BITS(1)) {
if ((*thiscoef & p1) == 0) { /* do nothing if already changed it */
if (*thiscoef >= 0)
*thiscoef += p1;
*thiscoef += (JCOEF)p1;
else
*thiscoef += m1;
*thiscoef += (JCOEF)m1;
}
}
}
......
......@@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1998, Thomas G. Lane.
* It was modified by The libjpeg-turbo Project to include only code relevant
* to libjpeg-turbo.
* libjpeg-turbo Modifications:
* Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -189,13 +189,13 @@ format_message(j_common_ptr cinfo, char *buffer)
/* Format the message into the passed buffer */
if (isstring)
sprintf(buffer, msgtext, err->msg_parm.s);
snprintf(buffer, JMSG_LENGTH_MAX, msgtext, err->msg_parm.s);
else
sprintf(buffer, msgtext,
err->msg_parm.i[0], err->msg_parm.i[1],
err->msg_parm.i[2], err->msg_parm.i[3],
err->msg_parm.i[4], err->msg_parm.i[5],
err->msg_parm.i[6], err->msg_parm.i[7]);
snprintf(buffer, JMSG_LENGTH_MAX, msgtext,
err->msg_parm.i[0], err->msg_parm.i[1],
err->msg_parm.i[2], err->msg_parm.i[3],
err->msg_parm.i[4], err->msg_parm.i[5],
err->msg_parm.i[6], err->msg_parm.i[7]);
}
......
......@@ -5,7 +5,7 @@
* Copyright (C) 1994-1997, Thomas G. Lane.
* Modified 1997-2009 by Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2014, 2017, D. R. Commander.
* Copyright (C) 2014, 2017, 2021-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -103,7 +103,7 @@ JMESSAGE(JERR_MISMATCHED_QUANT_TABLE,
"Cannot transcode due to multiple use of quantization table %d")
JMESSAGE(JERR_MISSING_DATA, "Scan script does not transmit all data")
JMESSAGE(JERR_MODE_CHANGE, "Invalid color quantization mode change")
JMESSAGE(JERR_NOTIMPL, "Not implemented yet")
JMESSAGE(JERR_NOTIMPL, "Requested features are incompatible")
JMESSAGE(JERR_NOT_COMPILED, "Requested feature was omitted at compile time")
#if JPEG_LIB_VERSION >= 70
JMESSAGE(JERR_NO_ARITH_TABLE, "Arithmetic table 0x%02x was not defined")
......@@ -268,6 +268,7 @@ JMESSAGE(JERR_BAD_DROP_SAMPLING,
/* Report a fatal error whose message takes a single string parameter.
 *
 * In order, the macro:
 *   1. stores `code` in cinfo->err->msg_code;
 *   2. copies at most JMSG_STR_PARM_MAX bytes of `str` into
 *      cinfo->err->msg_parm.s with strncpy();
 *   3. writes a NUL at index JMSG_STR_PARM_MAX - 1, because strncpy() does
 *      not terminate the destination when the source fills the whole count
 *      (a longer `str` is therefore truncated rather than left unterminated);
 *   4. invokes the error manager's error_exit callback.
 *
 * `cinfo` is expanded several times, so pass an expression without side
 * effects.
 * NOTE(review): assumes msg_parm.s holds at least JMSG_STR_PARM_MAX bytes --
 * confirm against the jpeg_error_mgr declaration.
 */
#define ERREXITS(cinfo, code, str) \
((cinfo)->err->msg_code = (code), \
strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
(cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
(*(cinfo)->err->error_exit) ((j_common_ptr)(cinfo)))
#define MAKESTMT(stuff) do { stuff } while (0)
......@@ -324,6 +325,7 @@ JMESSAGE(JERR_BAD_DROP_SAMPLING,
/* Emit a trace message whose text takes a single string parameter.
 *
 * In order, the macro:
 *   1. stores `code` in cinfo->err->msg_code;
 *   2. copies at most JMSG_STR_PARM_MAX bytes of `str` into
 *      cinfo->err->msg_parm.s with strncpy();
 *   3. writes a NUL at index JMSG_STR_PARM_MAX - 1, because strncpy() does
 *      not terminate the destination when the source fills the whole count
 *      (a longer `str` is therefore truncated rather than left unterminated);
 *   4. invokes the error manager's emit_message callback with `lvl`.
 *
 * `cinfo` is expanded several times, so pass an expression without side
 * effects.
 * NOTE(review): assumes msg_parm.s holds at least JMSG_STR_PARM_MAX bytes --
 * confirm against the jpeg_error_mgr declaration.
 */
#define TRACEMSS(cinfo, lvl, code, str) \
((cinfo)->err->msg_code = (code), \
strncpy((cinfo)->err->msg_parm.s, (str), JMSG_STR_PARM_MAX), \
(cinfo)->err->msg_parm.s[JMSG_STR_PARM_MAX - 1] = '\0', \
(*(cinfo)->err->emit_message) ((j_common_ptr)(cinfo), (lvl)))
#endif /* JERROR_H */
......@@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1994, Thomas G. Lane.
* It was modified by The libjpeg-turbo Project to include only code relevant
* to libjpeg-turbo.
* libjpeg-turbo Modifications:
* Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -17,72 +17,117 @@
* JPEG library. Most applications need only include jpeglib.h.
*/
#ifndef __JINCLUDE_H__
#define __JINCLUDE_H__
/* Include auto-config file to find out which system include files we need. */
#include "jconfig.h" /* auto configuration options */
#include "jconfigint.h"
#define JCONFIG_INCLUDED /* so that jpeglib.h doesn't do it again */
/*
* We need the NULL macro and size_t typedef.
* On an ANSI-conforming system it is sufficient to include <stddef.h>.
* Otherwise, we get them from <stdlib.h> or <stdio.h>; we may have to
* pull in <sys/types.h> as well.
* Note that the core JPEG library does not require <stdio.h>;
* only the default error handler and data source/destination modules do.
* But we must pull it in because of the references to FILE in jpeglib.h.
* You can remove those references if you want to compile without <stdio.h>.
*/
#ifdef HAVE_STDDEF_H
#include <stddef.h>
#endif
#ifdef HAVE_STDLIB_H
#include <stdlib.h>
#endif
#ifdef NEED_SYS_TYPES_H
#include <sys/types.h>
#endif
#include <stdio.h>
#include <string.h>
/*
* We need memory copying and zeroing functions, plus strncpy().
* ANSI and System V implementations declare these in <string.h>.
* BSD doesn't have the mem() functions, but it does have bcopy()/bzero().
* Some systems may declare memset and memcpy in <memory.h>.
*
* NOTE: we assume the size parameters to these functions are of type size_t.
* Change the casts in these macros if not!
* These macros/inline functions facilitate using Microsoft's "safe string"
* functions with Visual Studio builds without the need to scatter #ifdefs
* throughout the code base.
*/
#ifdef NEED_BSD_STRINGS
#include <strings.h>
#define MEMZERO(target, size) \
bzero((void *)(target), (size_t)(size))
#define MEMCOPY(dest, src, size) \
bcopy((const void *)(src), (void *)(dest), (size_t)(size))
#ifndef NO_GETENV
#else /* not BSD, assume ANSI/SysV string lib */
#ifdef _MSC_VER
#include <string.h>
#define MEMZERO(target, size) \
memset((void *)(target), 0, (size_t)(size))
#define MEMCOPY(dest, src, size) \
memcpy((void *)(dest), (const void *)(src), (size_t)(size))
static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name)
{
size_t required_size;
#endif
return (int)getenv_s(&required_size, buffer, buffer_size, name);
}
/*
* The modules that use fread() and fwrite() always invoke them through
* these macros. On some systems you may need to twiddle the argument casts.
* CAUTION: argument order is different from underlying functions!
#else /* _MSC_VER */
#include <errno.h>
/* This provides a similar interface to the Microsoft/C11 getenv_s() function,
* but other than parameter validation, it has no advantages over getenv().
*/
static INLINE int GETENV_S(char *buffer, size_t buffer_size, const char *name)
{
  const char *value;
  size_t needed;

  /* A NULL buffer is tolerated only when the caller also passes a zero size;
   * every other combination involving a NULL buffer or a zero size is an
   * invalid argument.
   */
  if (buffer == NULL && buffer_size == 0)
    return 0;
  if (buffer == NULL || buffer_size == 0) {
    errno = EINVAL;
    return errno;
  }

  /* A NULL name and an unset variable are handled the same way: success with
   * an empty string.  getenv() is never called with a NULL name.
   */
  value = (name != NULL) ? getenv(name) : NULL;
  if (value == NULL) {
    buffer[0] = 0;
    return 0;
  }

  /* The value plus its terminating NUL must fit; otherwise hand back an empty
   * string and ERANGE (errno itself is not modified on this path).
   */
  needed = strlen(value) + 1;
  if (needed > buffer_size) {
    buffer[0] = 0;
    return ERANGE;
  }

  /* The size check above guarantees that strncpy() writes the terminator (and
   * zero-fills whatever remains of the buffer).
   */
  strncpy(buffer, value, buffer_size);
  return 0;
}
#endif /* _MSC_VER */
#endif /* NO_GETENV */
#ifndef NO_PUTENV
#ifdef _WIN32
#define PUTENV_S(name, value) _putenv_s(name, value)
#else
/* This provides a similar interface to the Microsoft _putenv_s() function, but
* other than parameter validation, it has no advantages over setenv().
*/
#define JFREAD(file, buf, sizeofbuf) \
((size_t)fread((void *)(buf), (size_t)1, (size_t)(sizeofbuf), (file)))
#define JFWRITE(file, buf, sizeofbuf) \
((size_t)fwrite((const void *)(buf), (size_t)1, (size_t)(sizeofbuf), (file)))
/* Set environment variable `name` to `value`, overwriting any existing value.
 *
 * Returns 0 on success.  Returns EINVAL (and sets errno to EINVAL) if either
 * argument is NULL.  If setenv() itself fails, returns the errno value that
 * setenv() reported.
 */
static INLINE int PUTENV_S(const char *name, const char *value)
{
  if (!name || !value)
    return (errno = EINVAL);

  /* errno is only meaningful when setenv() reports failure.  A successful
   * call does not reset it, so returning errno unconditionally could hand the
   * caller a stale error code left behind by some unrelated earlier call.
   */
  if (setenv(name, value, 1) != 0)
    return errno;

  return 0;
}
#endif /* _WIN32 */
#endif /* NO_PUTENV */
#endif /* JINCLUDE_H */
......@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1997, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2016, 2021, D. R. Commander.
* Copyright (C) 2016, 2021-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -37,12 +37,6 @@
#endif
#include <limits.h>
#ifndef NO_GETENV
#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare getenv() */
extern char *getenv(const char *name);
#endif
#endif
LOCAL(size_t)
round_up_pow2(size_t a, size_t b)
......@@ -1162,12 +1156,16 @@ jinit_memory_mgr(j_common_ptr cinfo)
*/
#ifndef NO_GETENV
{
char *memenv;
char memenv[30] = { 0 };
if ((memenv = getenv("JPEGMEM")) != NULL) {
if (!GETENV_S(memenv, 30, "JPEGMEM") && strlen(memenv) > 0) {
char ch = 'x';
#ifdef _MSC_VER
if (sscanf_s(memenv, "%ld%c", &max_to_use, &ch, 1) > 0) {
#else
if (sscanf(memenv, "%ld%c", &max_to_use, &ch) > 0) {
#endif
if (ch == 'm' || ch == 'M')
max_to_use *= 1000L;
mem->pub.max_memory_to_use = max_to_use * 1000L;
......
......@@ -22,11 +22,6 @@
#include "jpeglib.h"
#include "jmemsys.h" /* import the system-dependent declarations */
#ifndef HAVE_STDLIB_H /* <stdlib.h> should declare malloc(),free() */
extern void *malloc(size_t size);
extern void free(void *ptr);
#endif
/*
* Memory allocation and freeing are controlled by the regular library
......
......@@ -100,11 +100,7 @@ typedef unsigned char UINT8;
/* UINT16 must hold at least the values 0..65535. */
#ifdef HAVE_UNSIGNED_SHORT
typedef unsigned short UINT16;
#else /* not HAVE_UNSIGNED_SHORT */
typedef unsigned int UINT16;
#endif /* HAVE_UNSIGNED_SHORT */
/* INT16 must hold at least the values -32768..32767. */
......
......@@ -373,12 +373,3 @@ extern const int jpeg_natural_order[]; /* zigzag coef order to natural order */
/* Arithmetic coding probability estimation tables in jaricom.c */
extern const JLONG jpeg_aritab[];
/* Suppress undefined-structure complaints if necessary. */
#ifdef INCOMPLETE_TYPES_BROKEN
#ifndef AM_MEMORY_MANAGER /* only jmemmgr.c defines these */
struct jvirt_sarray_control { long dummy; };
struct jvirt_barray_control { long dummy; };
#endif
#endif /* INCOMPLETE_TYPES_BROKEN */
......@@ -4,7 +4,7 @@
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1998, Thomas G. Lane.
* libjpeg-turbo Modifications:
* Copyright (C) 2013, D. R. Commander.
* Copyright (C) 2013, 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -29,7 +29,7 @@ add_huff_table(j_common_ptr cinfo, JHUFF_TBL **htblptr, const UINT8 *bits,
return;
/* Copy the number-of-symbols-of-each-code-length counts */
MEMCOPY((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
memcpy((*htblptr)->bits, bits, sizeof((*htblptr)->bits));
/* Validate the counts. We do this here mainly so we can copy the right
* number of symbols from the val[] array, without risking marching off
......@@ -41,8 +41,9 @@ add_huff_table(j_common_ptr cinfo, JHUFF_TBL **htblptr, const UINT8 *bits,
if (nsymbols < 1 || nsymbols > 256)
ERREXIT(cinfo, JERR_BAD_HUFF_TABLE);
MEMCOPY((*htblptr)->huffval, val, nsymbols * sizeof(UINT8));
MEMZERO(&((*htblptr)->huffval[nsymbols]), (256 - nsymbols) * sizeof(UINT8));
memcpy((*htblptr)->huffval, val, nsymbols * sizeof(UINT8));
memset(&((*htblptr)->huffval[nsymbols]), 0,
(256 - nsymbols) * sizeof(UINT8));
/* Initialize sent_table FALSE so table will be written to JPEG file. */
(*htblptr)->sent_table = FALSE;
......
......@@ -3,8 +3,8 @@
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-1996, Thomas G. Lane.
* It was modified by The libjpeg-turbo Project to include only code
* relevant to libjpeg-turbo.
* libjpeg-turbo Modifications:
* Copyright (C) 2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
......@@ -110,7 +110,7 @@ jcopy_sample_rows(JSAMPARRAY input_array, int source_row,
for (row = num_rows; row > 0; row--) {
inptr = *input_array++;
outptr = *output_array++;
MEMCOPY(outptr, inptr, count);
memcpy(outptr, inptr, count);
}
}
......@@ -120,7 +120,7 @@ jcopy_block_row(JBLOCKROW input_row, JBLOCKROW output_row,
JDIMENSION num_blocks)
/* Copy a row of coefficient blocks from one place to another. */
{
MEMCOPY(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF)));
memcpy(output_row, input_row, num_blocks * (DCTSIZE2 * sizeof(JCOEF)));
}
......@@ -129,5 +129,5 @@ jzero_far(void *target, size_t bytestozero)
/* Zero out a chunk of memory. */
/* This might be sample-array data, block-array data, or alloc_large data. */
{
MEMZERO(target, bytestozero);
memset(target, 0, bytestozero);
}
/*
* jversion.h
*
* This file was part of the Independent JPEG Group's software:
* Copyright (C) 1991-2020, Thomas G. Lane, Guido Vollbeding.
* libjpeg-turbo Modifications:
* Copyright (C) 2010, 2012-2022, D. R. Commander.
* For conditions of distribution and use, see the accompanying README.ijg
* file.
*
* This file contains software version identification.
*/
/* Select the version identification string from JPEG_LIB_VERSION:
 * 80 or higher -> "8d", 70 through 79 -> "7", anything lower -> "6b".
 * JPEG_LIB_VERSION is not defined in this header; it must already be visible
 * where this file is included.
 */
#if JPEG_LIB_VERSION >= 80
#define JVERSION "8d 15-Jan-2012"
#elif JPEG_LIB_VERSION >= 70
#define JVERSION "7 27-Jun-2009"
#else
#define JVERSION "6b 27-Mar-1998"
#endif
/*
* NOTE: It is our convention to place the authors in the following order:
* - libjpeg-turbo authors (2009-) in descending order of the date of their
* most recent contribution to the project, then in ascending order of the
* date of their first contribution to the project, then in alphabetical
* order
* - Upstream authors in descending order of the date of the first inclusion of
* their code
*/
/* Full copyright notice: a single string literal built from adjacent literals,
 * one copyright holder per line.  Every line except the last ends in '\n'.
 */
#define JCOPYRIGHT \
"Copyright (C) 2009-2022 D. R. Commander\n" \
"Copyright (C) 2015, 2020 Google, Inc.\n" \
"Copyright (C) 2019-2020 Arm Limited\n" \
"Copyright (C) 2015-2016, 2018 Matthieu Darbois\n" \
"Copyright (C) 2011-2016 Siarhei Siamashka\n" \
"Copyright (C) 2015 Intel Corporation\n" \
"Copyright (C) 2013-2014 Linaro Limited\n" \
"Copyright (C) 2013-2014 MIPS Technologies, Inc.\n" \
"Copyright (C) 2009, 2012 Pierre Ossman for Cendio AB\n" \
"Copyright (C) 2009-2011 Nokia Corporation and/or its subsidiary(-ies)\n" \
"Copyright (C) 1999-2006 MIYASAKA Masaru\n" \
"Copyright (C) 1991-2020 Thomas G. Lane, Guido Vollbeding"
/* One-line copyright notice.
 * NOTE(review): "@COPYRIGHT_YEAR@" looks like a build-time substitution
 * placeholder that is still unexpanded here -- confirm whether this header is
 * meant to be generated from a template before the string is used.
 */
#define JCOPYRIGHT_SHORT \
"Copyright (C) @COPYRIGHT_YEAR@ The libjpeg-turbo Project and many others"
此差异已折叠。
/*
* jccolext-neon.c - colorspace conversion (32-bit Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jccolor-neon.c */
/* RGB -> YCbCr conversion is defined by the following equations:
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
*
* Avoid floating point arithmetic by using shifted integer constants:
* 0.29899597 = 19595 * 2^-16
* 0.58700561 = 38470 * 2^-16
* 0.11399841 = 7471 * 2^-16
* 0.16874695 = 11059 * 2^-16
* 0.33125305 = 21709 * 2^-16
* 0.50000000 = 32768 * 2^-16
* 0.41868592 = 27439 * 2^-16
* 0.08131409 = 5329 * 2^-16
* These constants are defined in jccolor-neon.c
*
* We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
* rounds up or down the result via integer truncation.
*/
/* Convert num_rows rows of interleaved RGB(X/A) pixels to planar Y, Cb and Cr,
 * eight pixels per iteration.
 *
 * image_width  number of pixels in each row
 * input_buf    array of input row pointers; consumed one row at a time
 * output_buf   output planes: [0] = Y, [1] = Cb, [2] = Cr
 * output_row   index of the first row to write in each output plane
 * num_rows     number of rows to convert
 *
 * Every iteration stores a full 8 bytes to each output plane, including the
 * final partial group of a row.
 */
void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
                                JSAMPIMAGE output_buf, JDIMENSION output_row,
                                int num_rows)
{
  /* Scratch copy of the last (image_width % 8) pixels of a row, so that the
   * 8-pixel vector load never reads beyond the end of the input row.
   */
  ALIGN(16) uint8_t tmp_buf[8 * RGB_PIXELSIZE];

  /* Fetch the eight fixed-point coefficients as two 4-lane vectors. */
#ifdef HAVE_VLD1_U16_X2
  const uint16x4x2_t coeffs = vld1_u16_x2(jsimd_rgb_ycc_neon_consts);
#else
  /* GCC does not currently support the intrinsic vld1_<type>_x2(). */
  const uint16x4_t coeffs_0_3 = vld1_u16(jsimd_rgb_ycc_neon_consts);
  const uint16x4_t coeffs_4_7 = vld1_u16(jsimd_rgb_ycc_neon_consts + 4);
  const uint16x4x2_t coeffs = { { coeffs_0_3, coeffs_4_7 } };
#endif
  /* Going by the lanes used in the equations below:
   *   k_lo lanes 0..3: Y/R, Y/G, Y/B, Cb/R
   *   k_hi lanes 0..3: Cb/G, 0.5 (Cb/B and Cr/R), Cr/G, Cr/B
   */
  const uint16x4_t k_lo = coeffs.val[0];
  const uint16x4_t k_hi = coeffs.val[1];
  /* 128 in 16.16 fixed point plus just under 0.5, so that the plain
   * (truncating) right shift used for Cb and Cr below rounds the result.
   */
  const uint32x4_t chroma_bias = vdupq_n_u32((128 << 16) + 32767);

  for (; num_rows > 0; num_rows--) {
    JSAMPROW src = *input_buf++;
    JSAMPROW dst_y = output_buf[0][output_row];
    JSAMPROW dst_cb = output_buf[1][output_row];
    JSAMPROW dst_cr = output_buf[2][output_row];
    int cols_left;

    output_row++;

    for (cols_left = image_width; cols_left > 0; cols_left -= 8) {
      /* Final partial group: stage the remaining pixels in tmp_buf and load
       * from there instead of from the row itself.
       */
      if (cols_left < 8) {
        memcpy(tmp_buf, src, cols_left * RGB_PIXELSIZE);
        src = tmp_buf;
      }

      /* De-interleave eight pixels into one vector per channel. */
#if RGB_PIXELSIZE == 4
      uint8x8x4_t px = vld4_u8(src);
#else
      uint8x8x3_t px = vld3_u8(src);
#endif

      /* Widen each channel to 16 bits and split it into 4-lane halves. */
      const uint16x8_t red = vmovl_u8(px.val[RGB_RED]);
      const uint16x8_t green = vmovl_u8(px.val[RGB_GREEN]);
      const uint16x8_t blue = vmovl_u8(px.val[RGB_BLUE]);
      const uint16x4_t r_lo = vget_low_u16(red), r_hi = vget_high_u16(red);
      const uint16x4_t g_lo = vget_low_u16(green), g_hi = vget_high_u16(green);
      const uint16x4_t b_lo = vget_low_u16(blue), b_hi = vget_high_u16(blue);

      /* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
      const uint32x4_t y32_lo =
        vmlal_lane_u16(vmlal_lane_u16(vmull_lane_u16(r_lo, k_lo, 0),
                                      g_lo, k_lo, 1),
                       b_lo, k_lo, 2);
      const uint32x4_t y32_hi =
        vmlal_lane_u16(vmlal_lane_u16(vmull_lane_u16(r_hi, k_lo, 0),
                                      g_hi, k_lo, 1),
                       b_hi, k_lo, 2);

      /* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
      const uint32x4_t cb32_lo =
        vmlal_lane_u16(vmlsl_lane_u16(vmlsl_lane_u16(chroma_bias,
                                                     r_lo, k_lo, 3),
                                      g_lo, k_hi, 0),
                       b_lo, k_hi, 1);
      const uint32x4_t cb32_hi =
        vmlal_lane_u16(vmlsl_lane_u16(vmlsl_lane_u16(chroma_bias,
                                                     r_hi, k_lo, 3),
                                      g_hi, k_hi, 0),
                       b_hi, k_hi, 1);

      /* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
      const uint32x4_t cr32_lo =
        vmlsl_lane_u16(vmlsl_lane_u16(vmlal_lane_u16(chroma_bias,
                                                     r_lo, k_hi, 1),
                                      g_lo, k_hi, 2),
                       b_lo, k_hi, 3);
      const uint32x4_t cr32_hi =
        vmlsl_lane_u16(vmlsl_lane_u16(vmlal_lane_u16(chroma_bias,
                                                     r_hi, k_hi, 1),
                                      g_hi, k_hi, 2),
                       b_hi, k_hi, 3);

      /* Drop the 16 fractional bits and narrow to 16-bit lanes.  Y uses the
       * rounding shift; Cb and Cr use the plain shift because chroma_bias
       * already carries the rounding term.
       */
      const uint16x8_t y16 = vcombine_u16(vrshrn_n_u32(y32_lo, 16),
                                          vrshrn_n_u32(y32_hi, 16));
      const uint16x8_t cb16 = vcombine_u16(vshrn_n_u32(cb32_lo, 16),
                                           vshrn_n_u32(cb32_hi, 16));
      const uint16x8_t cr16 = vcombine_u16(vshrn_n_u32(cr32_lo, 16),
                                           vshrn_n_u32(cr32_hi, 16));

      /* Narrow to 8 bits and store eight samples per plane.  Buffer overwrite
       * is permitted up to the next multiple of ALIGN_SIZE bytes.
       */
      vst1_u8(dst_y, vmovn_u16(y16));
      vst1_u8(dst_cb, vmovn_u16(cb16));
      vst1_u8(dst_cr, vmovn_u16(cr16));

      /* Step all four cursors on to the next group of eight pixels. */
      src += (8 * RGB_PIXELSIZE);
      dst_y += 8;
      dst_cb += 8;
      dst_cr += 8;
    }
  }
}
/*
* jchuff-neon.c - Huffman entropy encoding (32-bit Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* NOTE: All referenced figures are from
* Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
#include "../../../jinclude.h"
#include "../../../jpeglib.h"
#include "../../../jsimd.h"
#include "../../../jdct.h"
#include "../../../jsimddct.h"
#include "../../jsimd.h"
#include "../jchuff.h"
#include "neon-compat.h"
#include <limits.h>
#include <arm_neon.h>
/* Huffman-encode one block of DCTSIZE2 coefficients.
 *
 * state        cast to (working_state *); its cur.put_buffer and cur.free_bits
 *              are read on entry and written back before returning
 * buffer       output byte pointer; returned to the caller at the end
 * block        coefficient block, read at offsets 0..63
 * last_dc_val  subtracted from block[0] to form the DC difference
 * dctbl        code (ehufco) and size (ehufsi) tables used for the DC symbol
 * actbl        code and size tables used for the AC symbols
 *
 * NOTE(review): PUT_CODE and PUT_BITS are macros defined outside this view;
 * presumably they operate on the locals put_buffer, free_bits and buffer by
 * name -- confirm in ../jchuff.h before renaming any of those locals.
 */
JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
JCOEFPTR block, int last_dc_val,
c_derived_tbl *dctbl,
c_derived_tbl *actbl)
{
/* Per-coefficient bit counts and bit patterns, stored in the order in which
 * the rows below are gathered, so that index i is the i-th coefficient to be
 * encoded.
 */
uint8_t block_nbits[DCTSIZE2];
uint16_t block_diff[DCTSIZE2];
/* Load rows of coefficients from DCT block in zig-zag order. */
/* Compute DC coefficient difference value. (F.1.1.5.1) */
/* Lane 0 of row0 keeps the DC difference; lanes 1-7 are overwritten with the
 * next seven coefficients.
 */
int16x8_t row0 = vdupq_n_s16(block[0] - last_dc_val);
row0 = vld1q_lane_s16(block + 1, row0, 1);
row0 = vld1q_lane_s16(block + 8, row0, 2);
row0 = vld1q_lane_s16(block + 16, row0, 3);
row0 = vld1q_lane_s16(block + 9, row0, 4);
row0 = vld1q_lane_s16(block + 2, row0, 5);
row0 = vld1q_lane_s16(block + 3, row0, 6);
row0 = vld1q_lane_s16(block + 10, row0, 7);
/* For the remaining rows, the dup load supplies lane 0 and lanes 1-7 are then
 * replaced one at a time.
 */
int16x8_t row1 = vld1q_dup_s16(block + 17);
row1 = vld1q_lane_s16(block + 24, row1, 1);
row1 = vld1q_lane_s16(block + 32, row1, 2);
row1 = vld1q_lane_s16(block + 25, row1, 3);
row1 = vld1q_lane_s16(block + 18, row1, 4);
row1 = vld1q_lane_s16(block + 11, row1, 5);
row1 = vld1q_lane_s16(block + 4, row1, 6);
row1 = vld1q_lane_s16(block + 5, row1, 7);
int16x8_t row2 = vld1q_dup_s16(block + 12);
row2 = vld1q_lane_s16(block + 19, row2, 1);
row2 = vld1q_lane_s16(block + 26, row2, 2);
row2 = vld1q_lane_s16(block + 33, row2, 3);
row2 = vld1q_lane_s16(block + 40, row2, 4);
row2 = vld1q_lane_s16(block + 48, row2, 5);
row2 = vld1q_lane_s16(block + 41, row2, 6);
row2 = vld1q_lane_s16(block + 34, row2, 7);
int16x8_t row3 = vld1q_dup_s16(block + 27);
row3 = vld1q_lane_s16(block + 20, row3, 1);
row3 = vld1q_lane_s16(block + 13, row3, 2);
row3 = vld1q_lane_s16(block + 6, row3, 3);
row3 = vld1q_lane_s16(block + 7, row3, 4);
row3 = vld1q_lane_s16(block + 14, row3, 5);
row3 = vld1q_lane_s16(block + 21, row3, 6);
row3 = vld1q_lane_s16(block + 28, row3, 7);
/* Magnitude of each coefficient and the number of leading zero bits in that
 * 16-bit magnitude.
 */
int16x8_t abs_row0 = vabsq_s16(row0);
int16x8_t abs_row1 = vabsq_s16(row1);
int16x8_t abs_row2 = vabsq_s16(row2);
int16x8_t abs_row3 = vabsq_s16(row3);
int16x8_t row0_lz = vclzq_s16(abs_row0);
int16x8_t row1_lz = vclzq_s16(abs_row1);
int16x8_t row2_lz = vclzq_s16(abs_row2);
int16x8_t row3_lz = vclzq_s16(abs_row3);
/* Compute number of bits required to represent each coefficient. */
/* nbits = 16 - leading_zeros(|coef|), narrowed to one byte per coefficient. */
uint8x8_t row0_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row0_lz)));
uint8x8_t row1_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row1_lz)));
uint8x8_t row2_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row2_lz)));
uint8x8_t row3_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row3_lz)));
vst1_u8(block_nbits + 0 * DCTSIZE, row0_nbits);
vst1_u8(block_nbits + 1 * DCTSIZE, row1_nbits);
vst1_u8(block_nbits + 2 * DCTSIZE, row2_nbits);
vst1_u8(block_nbits + 3 * DCTSIZE, row3_nbits);
/* Build a per-coefficient mask: the arithmetic shift by 15 yields all ones for
 * a negative coefficient and zero otherwise; vshlq_u16 with the negated
 * leading-zero count shifts that right, leaving ones only in the low nbits
 * bit positions.
 */
uint16x8_t row0_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row0, 15)),
vnegq_s16(row0_lz));
uint16x8_t row1_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row1, 15)),
vnegq_s16(row1_lz));
uint16x8_t row2_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row2, 15)),
vnegq_s16(row2_lz));
uint16x8_t row3_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row3, 15)),
vnegq_s16(row3_lz));
/* XOR with the mask leaves non-negative magnitudes unchanged and inverts the
 * low nbits bits of the magnitude for negative coefficients.
 */
uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1), row1_mask);
uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2), row2_mask);
uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3), row3_mask);
/* Store diff values for rows 0, 1, 2, and 3. */
vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
/* Load last four rows of coefficients from DCT block in zig-zag order. */
int16x8_t row4 = vld1q_dup_s16(block + 35);
row4 = vld1q_lane_s16(block + 42, row4, 1);
row4 = vld1q_lane_s16(block + 49, row4, 2);
row4 = vld1q_lane_s16(block + 56, row4, 3);
row4 = vld1q_lane_s16(block + 57, row4, 4);
row4 = vld1q_lane_s16(block + 50, row4, 5);
row4 = vld1q_lane_s16(block + 43, row4, 6);
row4 = vld1q_lane_s16(block + 36, row4, 7);
int16x8_t row5 = vld1q_dup_s16(block + 29);
row5 = vld1q_lane_s16(block + 22, row5, 1);
row5 = vld1q_lane_s16(block + 15, row5, 2);
row5 = vld1q_lane_s16(block + 23, row5, 3);
row5 = vld1q_lane_s16(block + 30, row5, 4);
row5 = vld1q_lane_s16(block + 37, row5, 5);
row5 = vld1q_lane_s16(block + 44, row5, 6);
row5 = vld1q_lane_s16(block + 51, row5, 7);
int16x8_t row6 = vld1q_dup_s16(block + 58);
row6 = vld1q_lane_s16(block + 59, row6, 1);
row6 = vld1q_lane_s16(block + 52, row6, 2);
row6 = vld1q_lane_s16(block + 45, row6, 3);
row6 = vld1q_lane_s16(block + 38, row6, 4);
row6 = vld1q_lane_s16(block + 31, row6, 5);
row6 = vld1q_lane_s16(block + 39, row6, 6);
row6 = vld1q_lane_s16(block + 46, row6, 7);
int16x8_t row7 = vld1q_dup_s16(block + 53);
row7 = vld1q_lane_s16(block + 60, row7, 1);
row7 = vld1q_lane_s16(block + 61, row7, 2);
row7 = vld1q_lane_s16(block + 54, row7, 3);
row7 = vld1q_lane_s16(block + 47, row7, 4);
row7 = vld1q_lane_s16(block + 55, row7, 5);
row7 = vld1q_lane_s16(block + 62, row7, 6);
row7 = vld1q_lane_s16(block + 63, row7, 7);
/* Same magnitude / leading-zero / nbits / mask / diff steps as for rows 0-3. */
int16x8_t abs_row4 = vabsq_s16(row4);
int16x8_t abs_row5 = vabsq_s16(row5);
int16x8_t abs_row6 = vabsq_s16(row6);
int16x8_t abs_row7 = vabsq_s16(row7);
int16x8_t row4_lz = vclzq_s16(abs_row4);
int16x8_t row5_lz = vclzq_s16(abs_row5);
int16x8_t row6_lz = vclzq_s16(abs_row6);
int16x8_t row7_lz = vclzq_s16(abs_row7);
/* Compute number of bits required to represent each coefficient. */
uint8x8_t row4_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row4_lz)));
uint8x8_t row5_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row5_lz)));
uint8x8_t row6_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row6_lz)));
uint8x8_t row7_nbits = vsub_u8(vdup_n_u8(16),
vmovn_u16(vreinterpretq_u16_s16(row7_lz)));
vst1_u8(block_nbits + 4 * DCTSIZE, row4_nbits);
vst1_u8(block_nbits + 5 * DCTSIZE, row5_nbits);
vst1_u8(block_nbits + 6 * DCTSIZE, row6_nbits);
vst1_u8(block_nbits + 7 * DCTSIZE, row7_nbits);
uint16x8_t row4_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row4, 15)),
vnegq_s16(row4_lz));
uint16x8_t row5_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row5, 15)),
vnegq_s16(row5_lz));
uint16x8_t row6_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row6, 15)),
vnegq_s16(row6_lz));
uint16x8_t row7_mask =
vshlq_u16(vreinterpretq_u16_s16(vshrq_n_s16(row7, 15)),
vnegq_s16(row7_lz));
uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4), row4_mask);
uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5), row5_mask);
uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6), row6_mask);
uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7), row7_mask);
/* Store diff values for rows 4, 5, 6, and 7. */
vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
/* Construct bitmap to accelerate encoding of AC coefficients. A set bit
* means that the corresponding coefficient != 0.
*/
/* Each lane becomes 0xFF where nbits > 0 (coefficient nonzero), else 0x00. */
uint8x8_t row0_nbits_gt0 = vcgt_u8(row0_nbits, vdup_n_u8(0));
uint8x8_t row1_nbits_gt0 = vcgt_u8(row1_nbits, vdup_n_u8(0));
uint8x8_t row2_nbits_gt0 = vcgt_u8(row2_nbits, vdup_n_u8(0));
uint8x8_t row3_nbits_gt0 = vcgt_u8(row3_nbits, vdup_n_u8(0));
uint8x8_t row4_nbits_gt0 = vcgt_u8(row4_nbits, vdup_n_u8(0));
uint8x8_t row5_nbits_gt0 = vcgt_u8(row5_nbits, vdup_n_u8(0));
uint8x8_t row6_nbits_gt0 = vcgt_u8(row6_nbits, vdup_n_u8(0));
uint8x8_t row7_nbits_gt0 = vcgt_u8(row7_nbits, vdup_n_u8(0));
/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
const uint8x8_t bitmap_mask =
vreinterpret_u8_u64(vmov_n_u64(0x0102040810204080));
/* Keep one distinct bit per lane so that summing the eight lanes of a row
 * packs the row's flags into a single byte.
 */
row0_nbits_gt0 = vand_u8(row0_nbits_gt0, bitmap_mask);
row1_nbits_gt0 = vand_u8(row1_nbits_gt0, bitmap_mask);
row2_nbits_gt0 = vand_u8(row2_nbits_gt0, bitmap_mask);
row3_nbits_gt0 = vand_u8(row3_nbits_gt0, bitmap_mask);
row4_nbits_gt0 = vand_u8(row4_nbits_gt0, bitmap_mask);
row5_nbits_gt0 = vand_u8(row5_nbits_gt0, bitmap_mask);
row6_nbits_gt0 = vand_u8(row6_nbits_gt0, bitmap_mask);
row7_nbits_gt0 = vand_u8(row7_nbits_gt0, bitmap_mask);
/* Three rounds of pairwise adds reduce the 8 x 8 flag bytes to 8 bytes, one
 * per row.  The higher-numbered row is passed first at every step, so row 7
 * ends up in byte 0 and row 0 in byte 7 of `bitmap`.
 */
uint8x8_t bitmap_rows_10 = vpadd_u8(row1_nbits_gt0, row0_nbits_gt0);
uint8x8_t bitmap_rows_32 = vpadd_u8(row3_nbits_gt0, row2_nbits_gt0);
uint8x8_t bitmap_rows_54 = vpadd_u8(row5_nbits_gt0, row4_nbits_gt0);
uint8x8_t bitmap_rows_76 = vpadd_u8(row7_nbits_gt0, row6_nbits_gt0);
uint8x8_t bitmap_rows_3210 = vpadd_u8(bitmap_rows_32, bitmap_rows_10);
uint8x8_t bitmap_rows_7654 = vpadd_u8(bitmap_rows_76, bitmap_rows_54);
uint8x8_t bitmap = vpadd_u8(bitmap_rows_7654, bitmap_rows_3210);
/* Shift left to remove DC bit. */
bitmap = vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap), 1));
/* Move bitmap to 32-bit scalar registers. */
/* Going by the variable names, 32-bit lane 1 covers coefficients 1-32 and lane
 * 0 covers coefficients 33-63, most significant bit first (this assumes a
 * little-endian lane layout -- TODO confirm).
 */
uint32_t bitmap_1_32 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 1);
uint32_t bitmap_33_63 = vget_lane_u32(vreinterpret_u32_u8(bitmap), 0);
/* Set up state and bit buffer for output bitstream. */
working_state *state_ptr = (working_state *)state;
int free_bits = state_ptr->cur.free_bits;
size_t put_buffer = state_ptr->cur.put_buffer;
/* Encode DC coefficient. */
unsigned int nbits = block_nbits[0];
/* Emit Huffman-coded symbol and additional diff bits. */
unsigned int diff = block_diff[0];
PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
/* Encode AC coefficients. */
unsigned int r = 0; /* r = run length of zeros */
unsigned int i = 1; /* i = number of coefficients encoded */
/* Code and size information for a run length of 16 zero coefficients */
const unsigned int code_0xf0 = actbl->ehufco[0xf0];
const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
/* First word of the bitmap.  Each pass jumps straight to the next set bit: the
 * leading-zero count is both the zero run length and the distance to advance
 * i.
 */
while (bitmap_1_32 != 0) {
r = BUILTIN_CLZ(bitmap_1_32);
i += r;
bitmap_1_32 <<= r;
nbits = block_nbits[i];
diff = block_diff[i];
while (r > 15) {
/* If run length > 15, emit special run-length-16 codes. */
PUT_BITS(code_0xf0, size_0xf0)
r -= 16;
}
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
unsigned int rs = (r << 4) + nbits;
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
i++;
/* Drop the bit just handled so the next CLZ finds the following one. */
bitmap_1_32 <<= 1;
}
/* Carry the zeros left at the end of the first word (indices i..32) over as
 * the starting run length for the second word, then continue at index 33.
 */
r = 33 - i;
i = 33;
/* Second word of the bitmap.  Here r accumulates across the word boundary, so
 * the leading-zero count is added to r and r is reset after each symbol.
 */
while (bitmap_33_63 != 0) {
unsigned int leading_zeros = BUILTIN_CLZ(bitmap_33_63);
r += leading_zeros;
i += leading_zeros;
bitmap_33_63 <<= leading_zeros;
nbits = block_nbits[i];
diff = block_diff[i];
while (r > 15) {
/* If run length > 15, emit special run-length-16 codes. */
PUT_BITS(code_0xf0, size_0xf0)
r -= 16;
}
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
unsigned int rs = (r << 4) + nbits;
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
r = 0;
i++;
bitmap_33_63 <<= 1;
}
/* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
* The value of RS for the EOB code is 0.
*/
/* i reaches 64 only when the coefficient at index 63 was itself encoded. */
if (i != 64) {
PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
}
/* Write the bit-buffer state back so the caller's state reflects this block. */
state_ptr->cur.put_buffer = put_buffer;
state_ptr->cur.free_bits = free_bits;
return buffer;
}
此差异已折叠。
/*
* jccolext-neon.c - colorspace conversion (64-bit Arm Neon)
*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* This file is included by jccolor-neon.c */
/* RGB -> YCbCr conversion is defined by the following equations:
* Y = 0.29900 * R + 0.58700 * G + 0.11400 * B
* Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128
* Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128
*
* Avoid floating point arithmetic by using shifted integer constants:
* 0.29899597 = 19595 * 2^-16
* 0.58700561 = 38470 * 2^-16
* 0.11399841 = 7471 * 2^-16
* 0.16874695 = 11059 * 2^-16
* 0.33125305 = 21709 * 2^-16
* 0.50000000 = 32768 * 2^-16
* 0.41868592 = 27439 * 2^-16
* 0.08131409 = 5329 * 2^-16
* These constants are defined in jccolor-neon.c
*
* We add the fixed-point equivalent of 0.5 to Cb and Cr, which effectively
* rounds up or down the result via integer truncation.
*/
void jsimd_rgb_ycc_convert_neon(JDIMENSION image_width, JSAMPARRAY input_buf,
JSAMPIMAGE output_buf, JDIMENSION output_row,
int num_rows)
{
/* Pointer to RGB(X/A) input data */
JSAMPROW inptr;
/* Pointers to Y, Cb, and Cr output data */
JSAMPROW outptr0, outptr1, outptr2;
/* Allocate temporary buffer for final (image_width % 16) pixels in row. */
ALIGN(16) uint8_t tmp_buf[16 * RGB_PIXELSIZE];
/* Set up conversion constants. */
const uint16x8_t consts = vld1q_u16(jsimd_rgb_ycc_neon_consts);
const uint32x4_t scaled_128_5 = vdupq_n_u32((128 << 16) + 32767);
while (--num_rows >= 0) {
inptr = *input_buf++;
outptr0 = output_buf[0][output_row];
outptr1 = output_buf[1][output_row];
outptr2 = output_buf[2][output_row];
output_row++;
int cols_remaining = image_width;
for (; cols_remaining >= 16; cols_remaining -= 16) {
#if RGB_PIXELSIZE == 4
uint8x16x4_t input_pixels = vld4q_u8(inptr);
#else
uint8x16x3_t input_pixels = vld3q_u8(inptr);
#endif
uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
uint32x4_t cb_ll = scaled_128_5;
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
uint32x4_t cb_lh = scaled_128_5;
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
uint32x4_t cb_hl = scaled_128_5;
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
uint32x4_t cb_hh = scaled_128_5;
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
uint32x4_t cr_ll = scaled_128_5;
cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
uint32x4_t cr_lh = scaled_128_5;
cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
uint32x4_t cr_hl = scaled_128_5;
cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
uint32x4_t cr_hh = scaled_128_5;
cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
vrshrn_n_u32(y_lh, 16));
uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
vrshrn_n_u32(y_hh, 16));
/* Descale Cb values (right shift) and narrow to 16-bit. */
uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
vshrn_n_u32(cb_lh, 16));
uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
vshrn_n_u32(cb_hh, 16));
/* Descale Cr values (right shift) and narrow to 16-bit. */
uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
vshrn_n_u32(cr_lh, 16));
uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
vshrn_n_u32(cr_hh, 16));
/* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
*/
vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
/* Increment pointers. */
inptr += (16 * RGB_PIXELSIZE);
outptr0 += 16;
outptr1 += 16;
outptr2 += 16;
}
if (cols_remaining > 8) {
/* To prevent buffer overread by the vector load instructions, the last
* (image_width % 16) columns of data are first memcopied to a temporary
* buffer large enough to accommodate the vector load.
*/
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
#if RGB_PIXELSIZE == 4
uint8x16x4_t input_pixels = vld4q_u8(inptr);
#else
uint8x16x3_t input_pixels = vld3q_u8(inptr);
#endif
uint16x8_t r_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_RED]));
uint16x8_t g_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t b_l = vmovl_u8(vget_low_u8(input_pixels.val[RGB_BLUE]));
uint16x8_t r_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_RED]));
uint16x8_t g_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_GREEN]));
uint16x8_t b_h = vmovl_u8(vget_high_u8(input_pixels.val[RGB_BLUE]));
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
uint32x4_t y_ll = vmull_laneq_u16(vget_low_u16(r_l), consts, 0);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(g_l), consts, 1);
y_ll = vmlal_laneq_u16(y_ll, vget_low_u16(b_l), consts, 2);
uint32x4_t y_lh = vmull_laneq_u16(vget_high_u16(r_l), consts, 0);
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(g_l), consts, 1);
y_lh = vmlal_laneq_u16(y_lh, vget_high_u16(b_l), consts, 2);
uint32x4_t y_hl = vmull_laneq_u16(vget_low_u16(r_h), consts, 0);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(g_h), consts, 1);
y_hl = vmlal_laneq_u16(y_hl, vget_low_u16(b_h), consts, 2);
uint32x4_t y_hh = vmull_laneq_u16(vget_high_u16(r_h), consts, 0);
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(g_h), consts, 1);
y_hh = vmlal_laneq_u16(y_hh, vget_high_u16(b_h), consts, 2);
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
uint32x4_t cb_ll = scaled_128_5;
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(r_l), consts, 3);
cb_ll = vmlsl_laneq_u16(cb_ll, vget_low_u16(g_l), consts, 4);
cb_ll = vmlal_laneq_u16(cb_ll, vget_low_u16(b_l), consts, 5);
uint32x4_t cb_lh = scaled_128_5;
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(r_l), consts, 3);
cb_lh = vmlsl_laneq_u16(cb_lh, vget_high_u16(g_l), consts, 4);
cb_lh = vmlal_laneq_u16(cb_lh, vget_high_u16(b_l), consts, 5);
uint32x4_t cb_hl = scaled_128_5;
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(r_h), consts, 3);
cb_hl = vmlsl_laneq_u16(cb_hl, vget_low_u16(g_h), consts, 4);
cb_hl = vmlal_laneq_u16(cb_hl, vget_low_u16(b_h), consts, 5);
uint32x4_t cb_hh = scaled_128_5;
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(r_h), consts, 3);
cb_hh = vmlsl_laneq_u16(cb_hh, vget_high_u16(g_h), consts, 4);
cb_hh = vmlal_laneq_u16(cb_hh, vget_high_u16(b_h), consts, 5);
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
uint32x4_t cr_ll = scaled_128_5;
cr_ll = vmlal_laneq_u16(cr_ll, vget_low_u16(r_l), consts, 5);
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(g_l), consts, 6);
cr_ll = vmlsl_laneq_u16(cr_ll, vget_low_u16(b_l), consts, 7);
uint32x4_t cr_lh = scaled_128_5;
cr_lh = vmlal_laneq_u16(cr_lh, vget_high_u16(r_l), consts, 5);
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(g_l), consts, 6);
cr_lh = vmlsl_laneq_u16(cr_lh, vget_high_u16(b_l), consts, 7);
uint32x4_t cr_hl = scaled_128_5;
cr_hl = vmlal_laneq_u16(cr_hl, vget_low_u16(r_h), consts, 5);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(g_h), consts, 6);
cr_hl = vmlsl_laneq_u16(cr_hl, vget_low_u16(b_h), consts, 7);
uint32x4_t cr_hh = scaled_128_5;
cr_hh = vmlal_laneq_u16(cr_hh, vget_high_u16(r_h), consts, 5);
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(g_h), consts, 6);
cr_hh = vmlsl_laneq_u16(cr_hh, vget_high_u16(b_h), consts, 7);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_l = vcombine_u16(vrshrn_n_u32(y_ll, 16),
vrshrn_n_u32(y_lh, 16));
uint16x8_t y_h = vcombine_u16(vrshrn_n_u32(y_hl, 16),
vrshrn_n_u32(y_hh, 16));
/* Descale Cb values (right shift) and narrow to 16-bit. */
uint16x8_t cb_l = vcombine_u16(vshrn_n_u32(cb_ll, 16),
vshrn_n_u32(cb_lh, 16));
uint16x8_t cb_h = vcombine_u16(vshrn_n_u32(cb_hl, 16),
vshrn_n_u32(cb_hh, 16));
/* Descale Cr values (right shift) and narrow to 16-bit. */
uint16x8_t cr_l = vcombine_u16(vshrn_n_u32(cr_ll, 16),
vshrn_n_u32(cr_lh, 16));
uint16x8_t cr_h = vcombine_u16(vshrn_n_u32(cr_hl, 16),
vshrn_n_u32(cr_hh, 16));
/* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
*/
vst1q_u8(outptr0, vcombine_u8(vmovn_u16(y_l), vmovn_u16(y_h)));
vst1q_u8(outptr1, vcombine_u8(vmovn_u16(cb_l), vmovn_u16(cb_h)));
vst1q_u8(outptr2, vcombine_u8(vmovn_u16(cr_l), vmovn_u16(cr_h)));
} else if (cols_remaining > 0) {
/* To prevent buffer overread by the vector load instructions, the last
* (image_width % 8) columns of data are first memcopied to a temporary
* buffer large enough to accommodate the vector load.
*/
memcpy(tmp_buf, inptr, cols_remaining * RGB_PIXELSIZE);
inptr = tmp_buf;
#if RGB_PIXELSIZE == 4
uint8x8x4_t input_pixels = vld4_u8(inptr);
#else
uint8x8x3_t input_pixels = vld3_u8(inptr);
#endif
uint16x8_t r = vmovl_u8(input_pixels.val[RGB_RED]);
uint16x8_t g = vmovl_u8(input_pixels.val[RGB_GREEN]);
uint16x8_t b = vmovl_u8(input_pixels.val[RGB_BLUE]);
/* Compute Y = 0.29900 * R + 0.58700 * G + 0.11400 * B */
uint32x4_t y_l = vmull_laneq_u16(vget_low_u16(r), consts, 0);
y_l = vmlal_laneq_u16(y_l, vget_low_u16(g), consts, 1);
y_l = vmlal_laneq_u16(y_l, vget_low_u16(b), consts, 2);
uint32x4_t y_h = vmull_laneq_u16(vget_high_u16(r), consts, 0);
y_h = vmlal_laneq_u16(y_h, vget_high_u16(g), consts, 1);
y_h = vmlal_laneq_u16(y_h, vget_high_u16(b), consts, 2);
/* Compute Cb = -0.16874 * R - 0.33126 * G + 0.50000 * B + 128 */
uint32x4_t cb_l = scaled_128_5;
cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(r), consts, 3);
cb_l = vmlsl_laneq_u16(cb_l, vget_low_u16(g), consts, 4);
cb_l = vmlal_laneq_u16(cb_l, vget_low_u16(b), consts, 5);
uint32x4_t cb_h = scaled_128_5;
cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(r), consts, 3);
cb_h = vmlsl_laneq_u16(cb_h, vget_high_u16(g), consts, 4);
cb_h = vmlal_laneq_u16(cb_h, vget_high_u16(b), consts, 5);
/* Compute Cr = 0.50000 * R - 0.41869 * G - 0.08131 * B + 128 */
uint32x4_t cr_l = scaled_128_5;
cr_l = vmlal_laneq_u16(cr_l, vget_low_u16(r), consts, 5);
cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(g), consts, 6);
cr_l = vmlsl_laneq_u16(cr_l, vget_low_u16(b), consts, 7);
uint32x4_t cr_h = scaled_128_5;
cr_h = vmlal_laneq_u16(cr_h, vget_high_u16(r), consts, 5);
cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(g), consts, 6);
cr_h = vmlsl_laneq_u16(cr_h, vget_high_u16(b), consts, 7);
/* Descale Y values (rounding right shift) and narrow to 16-bit. */
uint16x8_t y_u16 = vcombine_u16(vrshrn_n_u32(y_l, 16),
vrshrn_n_u32(y_h, 16));
/* Descale Cb values (right shift) and narrow to 16-bit. */
uint16x8_t cb_u16 = vcombine_u16(vshrn_n_u32(cb_l, 16),
vshrn_n_u32(cb_h, 16));
/* Descale Cr values (right shift) and narrow to 16-bit. */
uint16x8_t cr_u16 = vcombine_u16(vshrn_n_u32(cr_l, 16),
vshrn_n_u32(cr_h, 16));
/* Narrow Y, Cb, and Cr values to 8-bit and store to memory. Buffer
* overwrite is permitted up to the next multiple of ALIGN_SIZE bytes.
*/
vst1_u8(outptr0, vmovn_u16(y_u16));
vst1_u8(outptr1, vmovn_u16(cb_u16));
vst1_u8(outptr2, vmovn_u16(cr_u16));
}
}
}
/*
* jchuff-neon.c - Huffman entropy encoding (64-bit Arm Neon)
*
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
* Copyright (C) 2020, 2022, D. R. Commander. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*
* NOTE: All referenced figures are from
* Recommendation ITU-T T.81 (1992) | ISO/IEC 10918-1:1994.
*/
#define JPEG_INTERNALS
#include "../../../jinclude.h"
#include "../../../jpeglib.h"
#include "../../../jsimd.h"
#include "../../../jdct.h"
#include "../../../jsimddct.h"
#include "../../jsimd.h"
#include "../align.h"
#include "../jchuff.h"
#include "neon-compat.h"
#include <limits.h>
#include <arm_neon.h>
/* Byte-index table for the table-lookup shuffles that put a block into zig-zag
 * order in jsimd_huff_encode_one_block_neon().  Each group of 16 entries is
 * the index vector for one output row of eight 16-bit coefficients: every
 * coefficient is selected by a pair of consecutive byte indices (2k, 2k + 1).
 * Indices are relative to the particular group of input rows that is used as
 * the lookup table for that output row, not to the start of the block.  The
 * value 255 marks a lane whose coefficient lies outside that group of rows;
 * those lanes are filled in separately after the shuffle.
 */
ALIGN(16) static const uint8_t jsimd_huff_encode_one_block_consts[] = {
/* Output row 0 (looked up in input rows 0-3) */
0, 1, 2, 3, 16, 17, 32, 33,
18, 19, 4, 5, 6, 7, 20, 21,
/* Output row 1 (looked up in input rows 0-3) */
34, 35, 48, 49, 255, 255, 50, 51,
36, 37, 22, 23, 8, 9, 10, 11,
/* Output row 2 (looked up in input rows 2-5) */
255, 255, 6, 7, 20, 21, 34, 35,
48, 49, 255, 255, 50, 51, 36, 37,
/* Output row 3 (looked up in input rows 0-3) */
54, 55, 40, 41, 26, 27, 12, 13,
14, 15, 28, 29, 42, 43, 56, 57,
/* Output row 4 (looked up in input rows 4-7) */
6, 7, 20, 21, 34, 35, 48, 49,
50, 51, 36, 37, 22, 23, 8, 9,
/* Output row 5 (looked up in input rows 2-5) */
26, 27, 12, 13, 255, 255, 14, 15,
28, 29, 42, 43, 56, 57, 255, 255,
/* Output row 6 (looked up in input rows 4-7) */
52, 53, 54, 55, 40, 41, 26, 27,
12, 13, 255, 255, 14, 15, 28, 29,
/* Output row 7 (looked up in input rows 5-7) */
26, 27, 40, 41, 42, 43, 28, 29,
14, 15, 30, 31, 44, 45, 46, 47
};
/* jsimd_huff_encode_one_block_neon() -- Huffman-encode one 8x8 block of DCT
 * coefficients using AArch64 Neon intrinsics.
 *
 * Parameters:
 *   state        opaque pointer that is cast to working_state *.  Its
 *                cur.put_buffer and cur.free_bits fields are read on entry and
 *                written back on exit.
 *   buffer       output byte pointer.  The PUT_BITS()/PUT_CODE() macros are
 *                defined outside this file and presumably write through and
 *                advance it -- confirm against ../jchuff.h.
 *   block        the 64 coefficients of the block in natural (row-major)
 *                order.  Each row of DCTSIZE coefficients is loaded as one
 *                16-byte vector, so this code assumes 16-bit coefficients.
 *   last_dc_val  value subtracted from block[0] to form the DC difference.
 *   dctbl        table supplying ehufco[]/ehufsi[] for the DC symbol.
 *   actbl        table supplying ehufco[]/ehufsi[] for the AC symbols.
 *
 * Returns the value of buffer at the end of encoding.
 */
/* The AArch64 implementation of the FLUSH() macro triggers a UBSan misaligned
 * address warning because the macro sometimes writes a 64-bit value to a
 * non-64-bit-aligned address. That behavior is technically undefined per
 * the C specification, but it is supported by the AArch64 architecture and
 * compilers.
 */
#if defined(__has_feature)
#if __has_feature(undefined_behavior_sanitizer)
__attribute__((no_sanitize("alignment")))
#endif
#endif
JOCTET *jsimd_huff_encode_one_block_neon(void *state, JOCTET *buffer,
JCOEFPTR block, int last_dc_val,
c_derived_tbl *dctbl,
c_derived_tbl *actbl)
{
/* Per-coefficient "additional bits" values in zig-zag order.  All 64 entries
 * are stored by whichever AC branch below is taken before any is read.
 */
uint16_t block_diff[DCTSIZE2];
/* Load lookup table indices for rows of zig-zag ordering. */
#ifdef HAVE_VLD1Q_U8_X4
const uint8x16x4_t idx_rows_0123 =
vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE);
const uint8x16x4_t idx_rows_4567 =
vld1q_u8_x4(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE);
#else
/* GCC does not currently support intrinsics vld1q_<type>_x4(). */
const uint8x16x4_t idx_rows_0123 = { {
vld1q_u8(jsimd_huff_encode_one_block_consts + 0 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 2 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 4 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 6 * DCTSIZE)
} };
const uint8x16x4_t idx_rows_4567 = { {
vld1q_u8(jsimd_huff_encode_one_block_consts + 8 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 10 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 12 * DCTSIZE),
vld1q_u8(jsimd_huff_encode_one_block_consts + 14 * DCTSIZE)
} };
#endif
/* Load 8x8 block of DCT coefficients.  The coefficients are loaded as raw
 * bytes because the zig-zag shuffle below is a byte-wise table lookup.  The
 * same HAVE_VLD1Q_U8_X4 macro also gates the signed vld1q_s8_x4() variant.
 */
#ifdef HAVE_VLD1Q_U8_X4
const int8x16x4_t tbl_rows_0123 =
vld1q_s8_x4((int8_t *)(block + 0 * DCTSIZE));
const int8x16x4_t tbl_rows_4567 =
vld1q_s8_x4((int8_t *)(block + 4 * DCTSIZE));
#else
const int8x16x4_t tbl_rows_0123 = { {
vld1q_s8((int8_t *)(block + 0 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 1 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 2 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 3 * DCTSIZE))
} };
const int8x16x4_t tbl_rows_4567 = { {
vld1q_s8((int8_t *)(block + 4 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 5 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 6 * DCTSIZE)),
vld1q_s8((int8_t *)(block + 7 * DCTSIZE))
} };
#endif
/* Initialise extra lookup tables.  Output rows 2 and 5 are looked up in input
 * rows 2-5 and output row 7 in input rows 5-7, so those groupings are built
 * from the vectors that were already loaded.
 */
const int8x16x4_t tbl_rows_2345 = { {
tbl_rows_0123.val[2], tbl_rows_0123.val[3],
tbl_rows_4567.val[0], tbl_rows_4567.val[1]
} };
const int8x16x3_t tbl_rows_567 =
{ { tbl_rows_4567.val[1], tbl_rows_4567.val[2], tbl_rows_4567.val[3] } };
/* Shuffle coefficients into zig-zag order. */
int16x8_t row0 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[0]));
int16x8_t row1 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[1]));
int16x8_t row2 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_0123.val[2]));
int16x8_t row3 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_0123, idx_rows_0123.val[3]));
int16x8_t row4 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[0]));
int16x8_t row5 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_2345, idx_rows_4567.val[1]));
int16x8_t row6 =
vreinterpretq_s16_s8(vqtbl4q_s8(tbl_rows_4567, idx_rows_4567.val[2]));
int16x8_t row7 =
vreinterpretq_s16_s8(vqtbl3q_s8(tbl_rows_567, idx_rows_4567.val[3]));
/* Compute DC coefficient difference value (F.1.1.5.1).  Lane 0 of row0 is
 * overwritten, so from here on it holds the difference rather than block[0].
 */
row0 = vsetq_lane_s16(block[0] - last_dc_val, row0, 0);
/* Initialize AC coefficient lanes not reachable by lookup tables.  These are
 * exactly the lanes whose index pair is 255 in
 * jsimd_huff_encode_one_block_consts; each is copied from the input row and
 * lane that holds the coefficient.
 */
row1 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[0]),
0), row1, 2);
row2 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
4), row2, 0);
row2 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
0), row2, 5);
row5 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[1]),
7), row5, 2);
row5 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_4567.val[2]),
3), row5, 7);
row6 =
vsetq_lane_s16(vgetq_lane_s16(vreinterpretq_s16_s8(tbl_rows_0123.val[3]),
7), row6, 5);
/* DCT block is now in zig-zag order; start Huffman encoding process. */
/* Construct bitmap to accelerate encoding of AC coefficients. A set bit
 * means that the corresponding coefficient != 0.
 */
/* Testing a lane against itself yields all-ones if it is non-zero, else 0. */
uint16x8_t row0_ne_0 = vtstq_s16(row0, row0);
uint16x8_t row1_ne_0 = vtstq_s16(row1, row1);
uint16x8_t row2_ne_0 = vtstq_s16(row2, row2);
uint16x8_t row3_ne_0 = vtstq_s16(row3, row3);
uint16x8_t row4_ne_0 = vtstq_s16(row4, row4);
uint16x8_t row5_ne_0 = vtstq_s16(row5, row5);
uint16x8_t row6_ne_0 = vtstq_s16(row6, row6);
uint16x8_t row7_ne_0 = vtstq_s16(row7, row7);
/* Narrow each 16-bit flag to one byte and pack two rows into one vector. */
uint8x16_t row10_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row1_ne_0),
vreinterpretq_u8_u16(row0_ne_0));
uint8x16_t row32_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row3_ne_0),
vreinterpretq_u8_u16(row2_ne_0));
uint8x16_t row54_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row5_ne_0),
vreinterpretq_u8_u16(row4_ne_0));
uint8x16_t row76_ne_0 = vuzp1q_u8(vreinterpretq_u8_u16(row7_ne_0),
vreinterpretq_u8_u16(row6_ne_0));
/* { 0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01 } */
const uint8x16_t bitmap_mask =
vreinterpretq_u8_u64(vdupq_n_u64(0x0102040810204080));
/* Keep one distinct bit per lane so that the pairwise additions below act as
 * bitwise ORs and collapse each row of eight flags into a single byte.
 */
uint8x16_t bitmap_rows_10 = vandq_u8(row10_ne_0, bitmap_mask);
uint8x16_t bitmap_rows_32 = vandq_u8(row32_ne_0, bitmap_mask);
uint8x16_t bitmap_rows_54 = vandq_u8(row54_ne_0, bitmap_mask);
uint8x16_t bitmap_rows_76 = vandq_u8(row76_ne_0, bitmap_mask);
uint8x16_t bitmap_rows_3210 = vpaddq_u8(bitmap_rows_32, bitmap_rows_10);
uint8x16_t bitmap_rows_7654 = vpaddq_u8(bitmap_rows_76, bitmap_rows_54);
uint8x16_t bitmap_rows_76543210 = vpaddq_u8(bitmap_rows_7654,
bitmap_rows_3210);
uint8x8_t bitmap_all = vpadd_u8(vget_low_u8(bitmap_rows_76543210),
vget_high_u8(bitmap_rows_76543210));
/* Shift left to remove DC bit.  Afterwards the most significant bit of the
 * 64-bit bitmap describes the first AC coefficient (i == 1 below).
 */
bitmap_all =
vreinterpret_u8_u64(vshl_n_u64(vreinterpret_u64_u8(bitmap_all), 1));
/* Count bits set (number of non-zero coefficients) in bitmap. */
unsigned int non_zero_coefficients = vaddv_u8(vcnt_u8(bitmap_all));
/* Move bitmap to 64-bit scalar register. */
uint64_t bitmap = vget_lane_u64(vreinterpret_u64_u8(bitmap_all), 0);
/* Set up state and bit buffer for output bitstream.  These locals are written
 * back to *state_ptr at the end of the function; they are not named in the
 * PUT_BITS()/PUT_CODE() invocations, so those macros presumably update them
 * implicitly -- confirm against the macro definitions.
 */
working_state *state_ptr = (working_state *)state;
int free_bits = state_ptr->cur.free_bits;
size_t put_buffer = state_ptr->cur.put_buffer;
/* Encode DC coefficient. */
/* For negative coeffs: diff = abs(coeff) -1 = ~abs(coeff) */
int16x8_t abs_row0 = vabsq_s16(row0);
int16x8_t row0_lz = vclzq_s16(abs_row0);
/* A negative shift count shifts right: the all-ones mask of a negative lane
 * is reduced to its low (16 - lz) bits, so the XOR below inverts only the
 * bits that are emitted.
 */
uint16x8_t row0_mask = vshlq_u16(vcltzq_s16(row0), vnegq_s16(row0_lz));
uint16x8_t row0_diff = veorq_u16(vreinterpretq_u16_s16(abs_row0), row0_mask);
/* Find nbits required to specify sign and amplitude of coefficient. */
unsigned int lz = vgetq_lane_u16(vreinterpretq_u16_s16(row0_lz), 0);
unsigned int nbits = 16 - lz;
/* Emit Huffman-coded symbol and additional diff bits. */
unsigned int diff = vgetq_lane_u16(row0_diff, 0);
PUT_CODE(dctbl->ehufco[nbits], dctbl->ehufsi[nbits], diff)
/* Encode AC coefficients. */
unsigned int r = 0; /* r = run length of zeros */
unsigned int i = 1; /* i = number of coefficients encoded */
/* Code and size information for a run length of 16 zero coefficients */
const unsigned int code_0xf0 = actbl->ehufco[0xf0];
const unsigned int size_0xf0 = actbl->ehufsi[0xf0];
/* The most efficient method of computing nbits and diff depends on the
 * number of non-zero coefficients. If the bitmap is not too sparse (> 8
 * non-zero AC coefficients), it is beneficial to do all of the work using
 * Neon; else we do some of the work using Neon and the rest on demand using
 * scalar code.
 */
if (non_zero_coefficients > 8) {
/* Dense path: nbits and masked diff are precomputed for all 64 positions. */
uint8_t block_nbits[DCTSIZE2];
int16x8_t abs_row1 = vabsq_s16(row1);
int16x8_t abs_row2 = vabsq_s16(row2);
int16x8_t abs_row3 = vabsq_s16(row3);
int16x8_t abs_row4 = vabsq_s16(row4);
int16x8_t abs_row5 = vabsq_s16(row5);
int16x8_t abs_row6 = vabsq_s16(row6);
int16x8_t abs_row7 = vabsq_s16(row7);
int16x8_t row1_lz = vclzq_s16(abs_row1);
int16x8_t row2_lz = vclzq_s16(abs_row2);
int16x8_t row3_lz = vclzq_s16(abs_row3);
int16x8_t row4_lz = vclzq_s16(abs_row4);
int16x8_t row5_lz = vclzq_s16(abs_row5);
int16x8_t row6_lz = vclzq_s16(abs_row6);
int16x8_t row7_lz = vclzq_s16(abs_row7);
/* Narrow leading zero count to 8 bits. */
uint8x16_t row01_lz = vuzp1q_u8(vreinterpretq_u8_s16(row0_lz),
vreinterpretq_u8_s16(row1_lz));
uint8x16_t row23_lz = vuzp1q_u8(vreinterpretq_u8_s16(row2_lz),
vreinterpretq_u8_s16(row3_lz));
uint8x16_t row45_lz = vuzp1q_u8(vreinterpretq_u8_s16(row4_lz),
vreinterpretq_u8_s16(row5_lz));
uint8x16_t row67_lz = vuzp1q_u8(vreinterpretq_u8_s16(row6_lz),
vreinterpretq_u8_s16(row7_lz));
/* Compute nbits needed to specify magnitude of each coefficient. */
uint8x16_t row01_nbits = vsubq_u8(vdupq_n_u8(16), row01_lz);
uint8x16_t row23_nbits = vsubq_u8(vdupq_n_u8(16), row23_lz);
uint8x16_t row45_nbits = vsubq_u8(vdupq_n_u8(16), row45_lz);
uint8x16_t row67_nbits = vsubq_u8(vdupq_n_u8(16), row67_lz);
/* Store nbits. */
vst1q_u8(block_nbits + 0 * DCTSIZE, row01_nbits);
vst1q_u8(block_nbits + 2 * DCTSIZE, row23_nbits);
vst1q_u8(block_nbits + 4 * DCTSIZE, row45_nbits);
vst1q_u8(block_nbits + 6 * DCTSIZE, row67_nbits);
/* Mask bits not required to specify sign and amplitude of diff. */
uint16x8_t row1_mask = vshlq_u16(vcltzq_s16(row1), vnegq_s16(row1_lz));
uint16x8_t row2_mask = vshlq_u16(vcltzq_s16(row2), vnegq_s16(row2_lz));
uint16x8_t row3_mask = vshlq_u16(vcltzq_s16(row3), vnegq_s16(row3_lz));
uint16x8_t row4_mask = vshlq_u16(vcltzq_s16(row4), vnegq_s16(row4_lz));
uint16x8_t row5_mask = vshlq_u16(vcltzq_s16(row5), vnegq_s16(row5_lz));
uint16x8_t row6_mask = vshlq_u16(vcltzq_s16(row6), vnegq_s16(row6_lz));
uint16x8_t row7_mask = vshlq_u16(vcltzq_s16(row7), vnegq_s16(row7_lz));
/* diff = abs(coeff) ^ sign(coeff) [no-op for positive coefficients] */
uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
row1_mask);
uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
row2_mask);
uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
row3_mask);
uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
row4_mask);
uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
row5_mask);
uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
row6_mask);
uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
row7_mask);
/* Store diff bits. */
vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
/* Each iteration consumes one run of zeros plus the non-zero coefficient that
 * ends it.  The loop condition guarantees a non-zero argument to
 * BUILTIN_CLZLL().
 */
while (bitmap != 0) {
r = BUILTIN_CLZLL(bitmap);
i += r;
bitmap <<= r;
nbits = block_nbits[i];
diff = block_diff[i];
while (r > 15) {
/* If run length > 15, emit special run-length-16 codes. */
PUT_BITS(code_0xf0, size_0xf0)
r -= 16;
}
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
unsigned int rs = (r << 4) + nbits;
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
i++;
/* Drop the bit of the coefficient that was just encoded. */
bitmap <<= 1;
}
} else if (bitmap != 0) {
/* Sparse path: only abs and unmasked diff are precomputed with Neon. */
uint16_t block_abs[DCTSIZE2];
/* Compute and store absolute value of coefficients. */
int16x8_t abs_row1 = vabsq_s16(row1);
int16x8_t abs_row2 = vabsq_s16(row2);
int16x8_t abs_row3 = vabsq_s16(row3);
int16x8_t abs_row4 = vabsq_s16(row4);
int16x8_t abs_row5 = vabsq_s16(row5);
int16x8_t abs_row6 = vabsq_s16(row6);
int16x8_t abs_row7 = vabsq_s16(row7);
vst1q_u16(block_abs + 0 * DCTSIZE, vreinterpretq_u16_s16(abs_row0));
vst1q_u16(block_abs + 1 * DCTSIZE, vreinterpretq_u16_s16(abs_row1));
vst1q_u16(block_abs + 2 * DCTSIZE, vreinterpretq_u16_s16(abs_row2));
vst1q_u16(block_abs + 3 * DCTSIZE, vreinterpretq_u16_s16(abs_row3));
vst1q_u16(block_abs + 4 * DCTSIZE, vreinterpretq_u16_s16(abs_row4));
vst1q_u16(block_abs + 5 * DCTSIZE, vreinterpretq_u16_s16(abs_row5));
vst1q_u16(block_abs + 6 * DCTSIZE, vreinterpretq_u16_s16(abs_row6));
vst1q_u16(block_abs + 7 * DCTSIZE, vreinterpretq_u16_s16(abs_row7));
/* Compute diff bits (without nbits mask) and store. */
uint16x8_t row1_diff = veorq_u16(vreinterpretq_u16_s16(abs_row1),
vcltzq_s16(row1));
uint16x8_t row2_diff = veorq_u16(vreinterpretq_u16_s16(abs_row2),
vcltzq_s16(row2));
uint16x8_t row3_diff = veorq_u16(vreinterpretq_u16_s16(abs_row3),
vcltzq_s16(row3));
uint16x8_t row4_diff = veorq_u16(vreinterpretq_u16_s16(abs_row4),
vcltzq_s16(row4));
uint16x8_t row5_diff = veorq_u16(vreinterpretq_u16_s16(abs_row5),
vcltzq_s16(row5));
uint16x8_t row6_diff = veorq_u16(vreinterpretq_u16_s16(abs_row6),
vcltzq_s16(row6));
uint16x8_t row7_diff = veorq_u16(vreinterpretq_u16_s16(abs_row7),
vcltzq_s16(row7));
vst1q_u16(block_diff + 0 * DCTSIZE, row0_diff);
vst1q_u16(block_diff + 1 * DCTSIZE, row1_diff);
vst1q_u16(block_diff + 2 * DCTSIZE, row2_diff);
vst1q_u16(block_diff + 3 * DCTSIZE, row3_diff);
vst1q_u16(block_diff + 4 * DCTSIZE, row4_diff);
vst1q_u16(block_diff + 5 * DCTSIZE, row5_diff);
vst1q_u16(block_diff + 6 * DCTSIZE, row6_diff);
vst1q_u16(block_diff + 7 * DCTSIZE, row7_diff);
/* Same as above but must mask diff bits and compute nbits on demand. */
while (bitmap != 0) {
r = BUILTIN_CLZLL(bitmap);
i += r;
bitmap <<= r;
/* block_abs[i] is non-zero here because its bitmap bit is set, so the
 * count-leading-zeros argument is never 0.  The count is taken on the value
 * widened to 32 bits, hence nbits = 32 - lz.
 */
lz = BUILTIN_CLZ(block_abs[i]);
nbits = 32 - lz;
/* Shifting left then right by lz clears every bit above the low nbits. */
diff = ((unsigned int)block_diff[i] << lz) >> lz;
while (r > 15) {
/* If run length > 15, emit special run-length-16 codes. */
PUT_BITS(code_0xf0, size_0xf0)
r -= 16;
}
/* Emit Huffman symbol for run length / number of bits. (F.1.2.2.1) */
unsigned int rs = (r << 4) + nbits;
PUT_CODE(actbl->ehufco[rs], actbl->ehufsi[rs], diff)
i++;
/* Drop the bit of the coefficient that was just encoded. */
bitmap <<= 1;
}
}
/* If the last coefficient(s) were zero, emit an end-of-block (EOB) code.
 * The value of RS for the EOB code is 0.  i reaches 64 only when the final
 * zig-zag position (63) held a non-zero coefficient.
 */
if (i != 64) {
PUT_BITS(actbl->ehufco[0], actbl->ehufsi[0])
}
/* Write the bit-buffer state back so that the caller's state sees it. */
state_ptr->cur.put_buffer = put_buffer;
state_ptr->cur.free_bits = free_bits;
return buffer;
}
此差异已折叠。
/*
* Copyright (C) 2020, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* How to obtain memory alignment for structures and variables.
 *
 * ALIGN(alignment) expands to the compiler-specific alignment specifier:
 * __declspec(align(alignment)) when _MSC_VER is defined, otherwise
 * __attribute__((aligned(alignment))) for Clang and GCC.  Any other compiler
 * is rejected at compile time.  _MSC_VER is tested first, so a compiler that
 * defines both _MSC_VER and __clang__ takes the __declspec form.
 */
#if defined(_MSC_VER)
#define ALIGN(alignment) __declspec(align(alignment))
#elif defined(__clang__) || defined(__GNUC__)
#define ALIGN(alignment) __attribute__((aligned(alignment)))
#else
#error "Unknown compiler"
#endif
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
/*
* Copyright (C) 2020, D. R. Commander. All Rights Reserved.
* Copyright (C) 2020-2021, Arm Limited. All Rights Reserved.
*
* This software is provided 'as-is', without any express or implied
* warranty. In no event will the authors be held liable for any damages
* arising from the use of this software.
*
* Permission is granted to anyone to use this software for any purpose,
* including commercial applications, and to alter it and redistribute it
* freely, subject to the following restrictions:
*
* 1. The origin of this software must not be misrepresented; you must not
* claim that you wrote the original software. If you use this software
* in a product, an acknowledgment in the product documentation would be
* appreciated but is not required.
* 2. Altered source versions must be plainly marked as such, and must not be
* misrepresented as being the original software.
* 3. This notice may not be removed or altered from any source distribution.
*/
/* Compiler feature macros.  The three lines below are placeholders that
 * CMake's configure_file() turns into either a define of the named macro or a
 * commented-out undef, depending on the build-time configuration.
 * HAVE_VLD1Q_U8_X4 selects between the vld1q_u8_x4()/vld1q_s8_x4() intrinsics
 * and four separate 128-bit loads in the Huffman encoder; the other two
 * presumably gate vld1_s16_x3() and vld1_u16_x2() in the same way (their users
 * are not part of this header -- confirm at the call sites).
 */
#cmakedefine HAVE_VLD1_S16_X3
#cmakedefine HAVE_VLD1_U16_X2
#cmakedefine HAVE_VLD1Q_U8_X4
/* Define compiler-independent count-leading-zeros and byte-swap macros.
 *
 *   BUILTIN_CLZ(x)      leading zero count of a 32-bit value
 *   BUILTIN_CLZLL(x)    leading zero count of a 64-bit value
 *   BUILTIN_BSWAP64(x)  byte-reversed 64-bit value
 *
 * The first branch is taken only by MSVC proper; Clang builds that also
 * define _MSC_VER use the GCC-style builtins instead.  NOTE: the GCC/Clang
 * count-leading-zeros builtins have an undefined result for a zero argument,
 * so callers must only pass non-zero values.
 */
#if defined(_MSC_VER) && !defined(__clang__)
#define BUILTIN_CLZ(x) _CountLeadingZeros(x)
#define BUILTIN_CLZLL(x) _CountLeadingZeros64(x)
#define BUILTIN_BSWAP64(x) _byteswap_uint64(x)
#elif defined(__clang__) || defined(__GNUC__)
#define BUILTIN_CLZ(x) __builtin_clz(x)
#define BUILTIN_CLZLL(x) __builtin_clzll(x)
#define BUILTIN_BSWAP64(x) __builtin_bswap64(x)
#else
#error "Unknown compiler"
#endif
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册